From 992c238188a83befa0094a8c00bfead31aa302ed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
Date: Wed, 28 Jul 2021 19:51:50 +0200
Subject: dma-buf: nuke seqno-fence
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Entirely unused.

Signed-off-by: Christian König <christian.koenig@amd.com>
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20210729070330.41443-1-christian.koenig@amd.com
---
 include/linux/seqno-fence.h | 109 --------------------------------------------
 1 file changed, 109 deletions(-)
 delete mode 100644 include/linux/seqno-fence.h

(limited to 'include/linux')

diff --git a/include/linux/seqno-fence.h b/include/linux/seqno-fence.h
deleted file mode 100644
index 3cca2b8fac43..000000000000
--- a/include/linux/seqno-fence.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * seqno-fence, using a dma-buf to synchronize fencing
- *
- * Copyright (C) 2012 Texas Instruments
- * Copyright (C) 2012 Canonical Ltd
- * Authors:
- *   Rob Clark <robdclark@gmail.com>
- *   Maarten Lankhorst <maarten.lankhorst@canonical.com>
- */
-
-#ifndef __LINUX_SEQNO_FENCE_H
-#define __LINUX_SEQNO_FENCE_H
-
-#include <linux/dma-fence.h>
-#include <linux/dma-buf.h>
-
-enum seqno_fence_condition {
-	SEQNO_FENCE_WAIT_GEQUAL,
-	SEQNO_FENCE_WAIT_NONZERO
-};
-
-struct seqno_fence {
-	struct dma_fence base;
-
-	const struct dma_fence_ops *ops;
-	struct dma_buf *sync_buf;
-	uint32_t seqno_ofs;
-	enum seqno_fence_condition condition;
-};
-
-extern const struct dma_fence_ops seqno_fence_ops;
-
-/**
- * to_seqno_fence - cast a fence to a seqno_fence
- * @fence: fence to cast to a seqno_fence
- *
- * Returns NULL if the fence is not a seqno_fence,
- * or the seqno_fence otherwise.
- */
-static inline struct seqno_fence *
-to_seqno_fence(struct dma_fence *fence)
-{
-	if (fence->ops != &seqno_fence_ops)
-		return NULL;
-	return container_of(fence, struct seqno_fence, base);
-}
-
-/**
- * seqno_fence_init - initialize a seqno fence
- * @fence: seqno_fence to initialize
- * @lock: pointer to spinlock to use for fence
- * @sync_buf: buffer containing the memory location to signal on
- * @context: the execution context this fence is a part of
- * @seqno_ofs: the offset within @sync_buf
- * @seqno: the sequence # to signal on
- * @cond: fence wait condition
- * @ops: the fence_ops for operations on this seqno fence
- *
- * This function initializes a struct seqno_fence with passed parameters,
- * and takes a reference on sync_buf which is released on fence destruction.
- *
- * A seqno_fence is a dma_fence which can complete in software when
- * enable_signaling is called, but it also completes when
- * (s32)((sync_buf)[seqno_ofs] - seqno) >= 0 is true
- *
- * The seqno_fence will take a refcount on the sync_buf until it's
- * destroyed, but actual lifetime of sync_buf may be longer if one of the
- * callers take a reference to it.
- *
- * Certain hardware have instructions to insert this type of wait condition
- * in the command stream, so no intervention from software would be needed.
- * This type of fence can be destroyed before completed, however a reference
- * on the sync_buf dma-buf can be taken. It is encouraged to re-use the same
- * dma-buf for sync_buf, since mapping or unmapping the sync_buf to the
- * device's vm can be expensive.
- *
- * It is recommended for creators of seqno_fence to call dma_fence_signal()
- * before destruction. This will prevent possible issues from wraparound at
- * time of issue vs time of check, since users can check dma_fence_is_signaled()
- * before submitting instructions for the hardware to wait on the fence.
- * However, when ops.enable_signaling is not called, it doesn't have to be
- * done as soon as possible, just before there's any real danger of seqno
- * wraparound.
- */
-static inline void
-seqno_fence_init(struct seqno_fence *fence, spinlock_t *lock,
-		 struct dma_buf *sync_buf,  uint32_t context,
-		 uint32_t seqno_ofs, uint32_t seqno,
-		 enum seqno_fence_condition cond,
-		 const struct dma_fence_ops *ops)
-{
-	BUG_ON(!fence || !sync_buf || !ops);
-	BUG_ON(!ops->wait || !ops->enable_signaling ||
-	       !ops->get_driver_name || !ops->get_timeline_name);
-
-	/*
-	 * ops is used in dma_fence_init for get_driver_name, so needs to be
-	 * initialized first
-	 */
-	fence->ops = ops;
-	dma_fence_init(&fence->base, &seqno_fence_ops, lock, context, seqno);
-	get_dma_buf(sync_buf);
-	fence->sync_buf = sync_buf;
-	fence->seqno_ofs = seqno_ofs;
-	fence->condition = cond;
-}
-
-#endif /* __LINUX_SEQNO_FENCE_H */
-- 
cgit v1.2.3


From 880121be1179a0db3eb4fdb198108d9475b8be4c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <ckoenig.leichtzumerken@gmail.com>
Date: Thu, 15 Apr 2021 13:56:23 +0200
Subject: mm/vmscan: add sync_shrinkers function v3
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

While unplugging a device the TTM shrinker implementation
needs a barrier to make sure that all concurrent shrink
operations are done and no other CPU is referring to a
device specific pool any more.

Taking and releasing the shrinker semaphore on the write
side after unmapping and freeing all pages from the device
pool should make sure that no shrinker is running in
paralell.

This allows us to avoid the contented mutex in the TTM pool
implementation for every alloc/free operation.

v2: rework the commit message to make clear why we need this
v3: rename the function and add more doc as suggested by Daniel

Signed-off-by: Christian König <christian.koenig@amd.com>
Acked-by: Huang Rui <ray.huang@amd.com>
Acked-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20210820120528.81114-2-christian.koenig@amd.com
---
 include/linux/shrinker.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 9814fff58a69..76fbf92b04d9 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -93,4 +93,5 @@ extern void register_shrinker_prepared(struct shrinker *shrinker);
 extern int register_shrinker(struct shrinker *shrinker);
 extern void unregister_shrinker(struct shrinker *shrinker);
 extern void free_prealloced_shrinker(struct shrinker *shrinker);
+extern void synchronize_shrinkers(void);
 #endif
-- 
cgit v1.2.3


From d9edf92d496b61e5ac75b2b0aba5ea6c7f7ecdca Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Thu, 5 Aug 2021 12:47:05 +0200
Subject: dma-resv: Give the docs a do-over
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Specifically document the new/clarified rules around how the shared
fences do not have any ordering requirements against the exclusive
fence.

But also document all the things a bit better, given how central
struct dma_resv to dynamic buffer management the docs have been very
inadequat.

- Lots more links to other pieces of the puzzle. Unfortunately
  ttm_buffer_object has no docs, so no links :-(

- Explain/complain a bit about dma_resv_locking_ctx(). I still don't
  like that one, but fixing the ttm call chains is going to be
  horrible. Plus we want to plug in real slowpath locking when we do
  that anyway.

- Main part of the patch is some actual docs for struct dma_resv.

Overall I think we still have a lot of bad naming in this area (e.g.
dma_resv.fence is singular, but contains the multiple shared fences),
but I think that's more indicative of how the semantics and rules are
just not great.

Another thing that's real awkard is how chaining exclusive fences
right now means direct dma_resv.exclusive_fence pointer access with an
rcu_assign_pointer. Not so great either.

v2:
- Fix a pile of typos (Matt, Jason)
- Hammer it in that breaking the rules leads to use-after-free issues
  around dma-buf sharing (Christian)

Reviewed-by: Christian König <christian.koenig@amd.com>
Cc: Jason Ekstrand <jason@jlekstrand.net>
Cc: Matthew Auld <matthew.auld@intel.com>
Reviewed-by: Matthew Auld <matthew.auld@intel.com>
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Cc: Sumit Semwal <sumit.semwal@linaro.org>
Cc: "Christian König" <christian.koenig@amd.com>
Cc: linux-media@vger.kernel.org
Cc: linaro-mm-sig@lists.linaro.org
Link: https://patchwork.freedesktop.org/patch/msgid/20210805104705.862416-21-daniel.vetter@ffwll.ch
---
 include/linux/dma-buf.h  |   7 ++++
 include/linux/dma-resv.h | 104 ++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 106 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index 8b32b4bdd590..66470c37e471 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -420,6 +420,13 @@ struct dma_buf {
 	 * - Dynamic importers should set fences for any access that they can't
 	 *   disable immediately from their &dma_buf_attach_ops.move_notify
 	 *   callback.
+	 *
+	 * IMPORTANT:
+	 *
+	 * All drivers must obey the struct dma_resv rules, specifically the
+	 * rules for updating fences, see &dma_resv.fence_excl and
+	 * &dma_resv.fence. If these dependency rules are broken access tracking
+	 * can be lost resulting in use after free issues.
 	 */
 	struct dma_resv *resv;
 
diff --git a/include/linux/dma-resv.h b/include/linux/dma-resv.h
index e1ca2080a1ff..9100dd3dc21f 100644
--- a/include/linux/dma-resv.h
+++ b/include/linux/dma-resv.h
@@ -62,16 +62,90 @@ struct dma_resv_list {
 
 /**
  * struct dma_resv - a reservation object manages fences for a buffer
- * @lock: update side lock
- * @seq: sequence count for managing RCU read-side synchronization
- * @fence_excl: the exclusive fence, if there is one currently
- * @fence: list of current shared fences
+ *
+ * There are multiple uses for this, with sometimes slightly different rules in
+ * how the fence slots are used.
+ *
+ * One use is to synchronize cross-driver access to a struct dma_buf, either for
+ * dynamic buffer management or just to handle implicit synchronization between
+ * different users of the buffer in userspace. See &dma_buf.resv for a more
+ * in-depth discussion.
+ *
+ * The other major use is to manage access and locking within a driver in a
+ * buffer based memory manager. struct ttm_buffer_object is the canonical
+ * example here, since this is where reservation objects originated from. But
+ * use in drivers is spreading and some drivers also manage struct
+ * drm_gem_object with the same scheme.
  */
 struct dma_resv {
+	/**
+	 * @lock:
+	 *
+	 * Update side lock. Don't use directly, instead use the wrapper
+	 * functions like dma_resv_lock() and dma_resv_unlock().
+	 *
+	 * Drivers which use the reservation object to manage memory dynamically
+	 * also use this lock to protect buffer object state like placement,
+	 * allocation policies or throughout command submission.
+	 */
 	struct ww_mutex lock;
+
+	/**
+	 * @seq:
+	 *
+	 * Sequence count for managing RCU read-side synchronization, allows
+	 * read-only access to @fence_excl and @fence while ensuring we take a
+	 * consistent snapshot.
+	 */
 	seqcount_ww_mutex_t seq;
 
+	/**
+	 * @fence_excl:
+	 *
+	 * The exclusive fence, if there is one currently.
+	 *
+	 * There are two ways to update this fence:
+	 *
+	 * - First by calling dma_resv_add_excl_fence(), which replaces all
+	 *   fences attached to the reservation object. To guarantee that no
+	 *   fences are lost, this new fence must signal only after all previous
+	 *   fences, both shared and exclusive, have signalled. In some cases it
+	 *   is convenient to achieve that by attaching a struct dma_fence_array
+	 *   with all the new and old fences.
+	 *
+	 * - Alternatively the fence can be set directly, which leaves the
+	 *   shared fences unchanged. To guarantee that no fences are lost, this
+	 *   new fence must signal only after the previous exclusive fence has
+	 *   signalled. Since the shared fences are staying intact, it is not
+	 *   necessary to maintain any ordering against those. If semantically
+	 *   only a new access is added without actually treating the previous
+	 *   one as a dependency the exclusive fences can be strung together
+	 *   using struct dma_fence_chain.
+	 *
+	 * Note that actual semantics of what an exclusive or shared fence mean
+	 * is defined by the user, for reservation objects shared across drivers
+	 * see &dma_buf.resv.
+	 */
 	struct dma_fence __rcu *fence_excl;
+
+	/**
+	 * @fence:
+	 *
+	 * List of current shared fences.
+	 *
+	 * There are no ordering constraints of shared fences against the
+	 * exclusive fence slot. If a waiter needs to wait for all access, it
+	 * has to wait for both sets of fences to signal.
+	 *
+	 * A new fence is added by calling dma_resv_add_shared_fence(). Since
+	 * this often needs to be done past the point of no return in command
+	 * submission it cannot fail, and therefore sufficient slots need to be
+	 * reserved by calling dma_resv_reserve_shared().
+	 *
+	 * Note that actual semantics of what an exclusive or shared fence mean
+	 * is defined by the user, for reservation objects shared across drivers
+	 * see &dma_buf.resv.
+	 */
 	struct dma_resv_list __rcu *fence;
 };
 
@@ -98,6 +172,13 @@ static inline void dma_resv_reset_shared_max(struct dma_resv *obj) {}
  * undefined order, a #ww_acquire_ctx is passed to unwind if a cycle
  * is detected. See ww_mutex_lock() and ww_acquire_init(). A reservation
  * object may be locked by itself by passing NULL as @ctx.
+ *
+ * When a die situation is indicated by returning -EDEADLK all locks held by
+ * @ctx must be unlocked and then dma_resv_lock_slow() called on @obj.
+ *
+ * Unlocked by calling dma_resv_unlock().
+ *
+ * See also dma_resv_lock_interruptible() for the interruptible variant.
  */
 static inline int dma_resv_lock(struct dma_resv *obj,
 				struct ww_acquire_ctx *ctx)
@@ -119,6 +200,12 @@ static inline int dma_resv_lock(struct dma_resv *obj,
  * undefined order, a #ww_acquire_ctx is passed to unwind if a cycle
  * is detected. See ww_mutex_lock() and ww_acquire_init(). A reservation
  * object may be locked by itself by passing NULL as @ctx.
+ *
+ * When a die situation is indicated by returning -EDEADLK all locks held by
+ * @ctx must be unlocked and then dma_resv_lock_slow_interruptible() called on
+ * @obj.
+ *
+ * Unlocked by calling dma_resv_unlock().
  */
 static inline int dma_resv_lock_interruptible(struct dma_resv *obj,
 					      struct ww_acquire_ctx *ctx)
@@ -134,6 +221,8 @@ static inline int dma_resv_lock_interruptible(struct dma_resv *obj,
  * Acquires the reservation object after a die case. This function
  * will sleep until the lock becomes available. See dma_resv_lock() as
  * well.
+ *
+ * See also dma_resv_lock_slow_interruptible() for the interruptible variant.
  */
 static inline void dma_resv_lock_slow(struct dma_resv *obj,
 				      struct ww_acquire_ctx *ctx)
@@ -167,7 +256,7 @@ static inline int dma_resv_lock_slow_interruptible(struct dma_resv *obj,
  * if they overlap with a writer.
  *
  * Also note that since no context is provided, no deadlock protection is
- * possible.
+ * possible, which is also not needed for a trylock.
  *
  * Returns true if the lock was acquired, false otherwise.
  */
@@ -193,6 +282,11 @@ static inline bool dma_resv_is_locked(struct dma_resv *obj)
  *
  * Returns the context used to lock a reservation object or NULL if no context
  * was used or the object is not locked at all.
+ *
+ * WARNING: This interface is pretty horrible, but TTM needs it because it
+ * doesn't pass the struct ww_acquire_ctx around in some very long callchains.
+ * Everyone else just uses it to check whether they're holding a reservation or
+ * not.
  */
 static inline struct ww_acquire_ctx *dma_resv_locking_ctx(struct dma_resv *obj)
 {
-- 
cgit v1.2.3


From d72277b6c37db66b457fd6b77aabd5e930d58687 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
Date: Thu, 29 Jul 2021 08:39:31 +0200
Subject: dma-buf: nuke DMA_FENCE_TRACE macros v2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Only the DRM GPU scheduler, radeon and amdgpu where using them and they depend
on a non existing config option to actually emit some code.

v2: keep the signal path as is for now

Signed-off-by: Christian König <christian.koenig@amd.com>
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20210818105443.1578-1-christian.koenig@amd.com
---
 include/linux/dma-fence.h | 22 ----------------------
 1 file changed, 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h
index 6ffb4b2c6371..4cc119ab272f 100644
--- a/include/linux/dma-fence.h
+++ b/include/linux/dma-fence.h
@@ -590,26 +590,4 @@ struct dma_fence *dma_fence_get_stub(void);
 struct dma_fence *dma_fence_allocate_private_stub(void);
 u64 dma_fence_context_alloc(unsigned num);
 
-#define DMA_FENCE_TRACE(f, fmt, args...) \
-	do {								\
-		struct dma_fence *__ff = (f);				\
-		if (IS_ENABLED(CONFIG_DMA_FENCE_TRACE))			\
-			pr_info("f %llu#%llu: " fmt,			\
-				__ff->context, __ff->seqno, ##args);	\
-	} while (0)
-
-#define DMA_FENCE_WARN(f, fmt, args...) \
-	do {								\
-		struct dma_fence *__ff = (f);				\
-		pr_warn("f %llu#%llu: " fmt, __ff->context, __ff->seqno,\
-			 ##args);					\
-	} while (0)
-
-#define DMA_FENCE_ERR(f, fmt, args...) \
-	do {								\
-		struct dma_fence *__ff = (f);				\
-		pr_err("f %llu#%llu: " fmt, __ff->context, __ff->seqno,	\
-			##args);					\
-	} while (0)
-
 #endif /* __LINUX_DMA_FENCE_H */
-- 
cgit v1.2.3


From b83dcd753dbe42d5e7467ab65124f3d0a6002dc3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
Date: Wed, 21 Jul 2021 11:01:04 +0200
Subject: dma-buf: clarify dma_fence_ops->wait documentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This callback is pretty much deprecated and should not be used by new implementations.

Clarify that in the documentation as well.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20210901120240.7339-2-christian.koenig@amd.com
---
 include/linux/dma-fence.h | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h
index 4cc119ab272f..a706b7bf51d7 100644
--- a/include/linux/dma-fence.h
+++ b/include/linux/dma-fence.h
@@ -214,19 +214,15 @@ struct dma_fence_ops {
 	 * Custom wait implementation, defaults to dma_fence_default_wait() if
 	 * not set.
 	 *
-	 * The dma_fence_default_wait implementation should work for any fence, as long
-	 * as @enable_signaling works correctly. This hook allows drivers to
-	 * have an optimized version for the case where a process context is
-	 * already available, e.g. if @enable_signaling for the general case
-	 * needs to set up a worker thread.
+	 * Deprecated and should not be used by new implementations. Only used
+	 * by existing implementations which need special handling for their
+	 * hardware reset procedure.
 	 *
 	 * Must return -ERESTARTSYS if the wait is intr = true and the wait was
 	 * interrupted, and remaining jiffies if fence has signaled, or 0 if wait
 	 * timed out. Can also return other error values on custom implementations,
 	 * which should be treated as if the fence is signaled. For example a hardware
 	 * lockup could be reported like that.
-	 *
-	 * This callback is optional.
 	 */
 	signed long (*wait)(struct dma_fence *fence,
 			    bool intr, signed long timeout);
-- 
cgit v1.2.3


From cb19c107979b8000825060cb5b84dfe9212f4f86 Mon Sep 17 00:00:00 2001
From: Yongqiang Niu <yongqiang.niu@mediatek.com>
Date: Mon, 2 Aug 2021 16:59:32 +0800
Subject: soc: mediatek: mmsys: add comp OVL_2L2/POSTMASK/RDMA4

This patch add some more ddp component
OVL_2L2 is ovl which include 2 layers overlay
POSTMASK control round corner for display frame
RDMA4 read dma buffer

Signed-off-by: Yongqiang Niu <yongqiang.niu@mediatek.com>
Reviewed-by: Chun-Kuang Hu <chunkuang.hu@kernel.org>
Reviewed-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Link: https://lore.kernel.org/r/1627894773-23872-2-git-send-email-yongqiang.niu@mediatek.com
Signed-off-by: Matthias Brugger <matthias.bgg@gmail.com>
---
 include/linux/soc/mediatek/mtk-mmsys.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/soc/mediatek/mtk-mmsys.h b/include/linux/soc/mediatek/mtk-mmsys.h
index 2228bf6133da..4bba275e235a 100644
--- a/include/linux/soc/mediatek/mtk-mmsys.h
+++ b/include/linux/soc/mediatek/mtk-mmsys.h
@@ -29,13 +29,16 @@ enum mtk_ddp_comp_id {
 	DDP_COMPONENT_OVL0,
 	DDP_COMPONENT_OVL_2L0,
 	DDP_COMPONENT_OVL_2L1,
+	DDP_COMPONENT_OVL_2L2,
 	DDP_COMPONENT_OVL1,
+	DDP_COMPONENT_POSTMASK0,
 	DDP_COMPONENT_PWM0,
 	DDP_COMPONENT_PWM1,
 	DDP_COMPONENT_PWM2,
 	DDP_COMPONENT_RDMA0,
 	DDP_COMPONENT_RDMA1,
 	DDP_COMPONENT_RDMA2,
+	DDP_COMPONENT_RDMA4,
 	DDP_COMPONENT_UFOE,
 	DDP_COMPONENT_WDMA0,
 	DDP_COMPONENT_WDMA1,
-- 
cgit v1.2.3


From f55e36d5ab76c3097ff36ecea60b91c6b0d80fc8 Mon Sep 17 00:00:00 2001
From: Shai Malin <smalin@marvell.com>
Date: Mon, 13 Sep 2021 10:50:24 +0300
Subject: qed: Improve the stack space of filter_config()

As it was reported and discussed in: https://lore.kernel.org/lkml/CAHk-=whF9F89vsfH8E9TGc0tZA-yhzi2Di8wOtquNB5vRkFX5w@mail.gmail.com/
This patch improves the stack space of qede_config_rx_mode() by
splitting filter_config() to 3 functions and removing the
union qed_filter_type_params.

Reported-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Signed-off-by: Ariel Elior <aelior@marvell.com>
Signed-off-by: Shai Malin <smalin@marvell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/qed/qed_eth_if.h | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h
index 812a4d751163..4df0bf0a0864 100644
--- a/include/linux/qed/qed_eth_if.h
+++ b/include/linux/qed/qed_eth_if.h
@@ -145,12 +145,6 @@ struct qed_filter_mcast_params {
 	unsigned char mac[64][ETH_ALEN];
 };
 
-union qed_filter_type_params {
-	enum qed_filter_rx_mode_type accept_flags;
-	struct qed_filter_ucast_params ucast;
-	struct qed_filter_mcast_params mcast;
-};
-
 enum qed_filter_type {
 	QED_FILTER_TYPE_UCAST,
 	QED_FILTER_TYPE_MCAST,
@@ -158,11 +152,6 @@ enum qed_filter_type {
 	QED_MAX_FILTER_TYPES,
 };
 
-struct qed_filter_params {
-	enum qed_filter_type type;
-	union qed_filter_type_params filter;
-};
-
 struct qed_tunn_params {
 	u16 vxlan_port;
 	u8 update_vxlan_port;
@@ -314,8 +303,14 @@ struct qed_eth_ops {
 
 	int (*q_tx_stop)(struct qed_dev *cdev, u8 rss_id, void *handle);
 
-	int (*filter_config)(struct qed_dev *cdev,
-			     struct qed_filter_params *params);
+	int (*filter_config_rx_mode)(struct qed_dev *cdev,
+				     enum qed_filter_rx_mode_type type);
+
+	int (*filter_config_ucast)(struct qed_dev *cdev,
+				   struct qed_filter_ucast_params *params);
+
+	int (*filter_config_mcast)(struct qed_dev *cdev,
+				   struct qed_filter_mcast_params *params);
 
 	int (*fastpath_stop)(struct qed_dev *cdev);
 
-- 
cgit v1.2.3


From 537d3af1bee8ad1415fda9b622d1ea6d1ae76dfa Mon Sep 17 00:00:00 2001
From: Arnaud Pouliquen <arnaud.pouliquen@foss.st.com>
Date: Mon, 12 Jul 2021 14:39:12 +0200
Subject: rpmsg: Fix rpmsg_create_ept return when RPMSG config is not defined

According to the description of the rpmsg_create_ept in rpmsg_core.c
the function should return NULL on error.

Fixes: 2c8a57088045 ("rpmsg: Provide function stubs for API")
Signed-off-by: Arnaud Pouliquen <arnaud.pouliquen@foss.st.com>
Reviewed-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Link: https://lore.kernel.org/r/20210712123912.10672-1-arnaud.pouliquen@foss.st.com
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 include/linux/rpmsg.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rpmsg.h b/include/linux/rpmsg.h
index d97dcd049f18..a8dcf8a9ae88 100644
--- a/include/linux/rpmsg.h
+++ b/include/linux/rpmsg.h
@@ -231,7 +231,7 @@ static inline struct rpmsg_endpoint *rpmsg_create_ept(struct rpmsg_device *rpdev
 	/* This shouldn't be possible */
 	WARN_ON(1);
 
-	return ERR_PTR(-ENXIO);
+	return NULL;
 }
 
 static inline int rpmsg_send(struct rpmsg_endpoint *ept, void *data, int len)
-- 
cgit v1.2.3


From c22ac2a3d4bd83411ebf0b1726e9e5fc4f5e7ebf Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Fri, 10 Sep 2021 11:33:50 -0700
Subject: perf: Enable branch record for software events

The typical way to access branch record (e.g. Intel LBR) is via hardware
perf_event. For CPUs with FREEZE_LBRS_ON_PMI support, PMI could capture
reliable LBR. On the other hand, LBR could also be useful in non-PMI
scenario. For example, in kretprobe or bpf fexit program, LBR could
provide a lot of information on what happened with the function. Add API
to use branch record for software use.

Note that, when the software event triggers, it is necessary to stop the
branch record hardware asap. Therefore, static_call is used to remove some
branch instructions in this process.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/bpf/20210910183352.3151445-2-songliubraving@fb.com
---
 include/linux/perf_event.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index fe156a8170aa..0cbc5dfe1110 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -57,6 +57,7 @@ struct perf_guest_info_callbacks {
 #include <linux/cgroup.h>
 #include <linux/refcount.h>
 #include <linux/security.h>
+#include <linux/static_call.h>
 #include <asm/local.h>
 
 struct perf_callchain_entry {
@@ -1612,4 +1613,26 @@ extern void __weak arch_perf_update_userpage(struct perf_event *event,
 extern __weak u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr);
 #endif
 
+/*
+ * Snapshot branch stack on software events.
+ *
+ * Branch stack can be very useful in understanding software events. For
+ * example, when a long function, e.g. sys_perf_event_open, returns an
+ * errno, it is not obvious why the function failed. Branch stack could
+ * provide very helpful information in this type of scenarios.
+ *
+ * On software event, it is necessary to stop the hardware branch recorder
+ * fast. Otherwise, the hardware register/buffer will be flushed with
+ * entries of the triggering event. Therefore, static call is used to
+ * stop the hardware recorder.
+ */
+
+/*
+ * cnt is the number of entries allocated for entries.
+ * Return number of entries copied to .
+ */
+typedef int (perf_snapshot_branch_stack_t)(struct perf_branch_entry *entries,
+					   unsigned int cnt);
+DECLARE_STATIC_CALL(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t);
+
 #endif /* _LINUX_PERF_EVENT_H */
-- 
cgit v1.2.3


From efeff6b39b9de4480572c7b0c5eb77204795cb57 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 5 Aug 2021 13:28:24 -0700
Subject: rcutorture: Warn on individual rcu_torture_init() error conditions

When running rcutorture as a module, any rcu_torture_init() issues will be
reflected in the error code from modprobe or insmod, as the case may be.
However, these error codes are not available when running rcutorture
built-in, for example, when using the kvm.sh script.  This commit
therefore adds WARN_ON_ONCE() to allow distinguishing rcu_torture_init()
errors when running rcutorture built-in.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/torture.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/torture.h b/include/linux/torture.h
index 0910c5803f35..24f58e50a94b 100644
--- a/include/linux/torture.h
+++ b/include/linux/torture.h
@@ -47,6 +47,14 @@ do {										\
 } while (0)
 void verbose_torout_sleep(void);
 
+#define torture_init_error(firsterr)						\
+({										\
+	int ___firsterr = (firsterr);						\
+										\
+	WARN_ONCE(!IS_MODULE(CONFIG_RCU_TORTURE_TEST) && ___firsterr < 0, "Torture-test initialization failed with error code %d\n", ___firsterr); \
+	___firsterr < 0;								\
+})
+
 /* Definitions for online/offline exerciser. */
 #ifdef CONFIG_HOTPLUG_CPU
 int torture_num_online_cpus(void);
-- 
cgit v1.2.3


From f4c87dbbef2638f6da6e29b5e998e3b1dcdb08ee Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Mon, 9 Aug 2021 13:25:13 +0200
Subject: kcsan: Save instruction pointer for scoped accesses

Save the instruction pointer for scoped accesses, so that it becomes
possible for the reporting code to construct more accurate stack traces
that will show the start of the scope.

Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/kcsan-checks.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kcsan-checks.h b/include/linux/kcsan-checks.h
index 9fd0ad80fef6..5f5965246877 100644
--- a/include/linux/kcsan-checks.h
+++ b/include/linux/kcsan-checks.h
@@ -100,9 +100,12 @@ void kcsan_set_access_mask(unsigned long mask);
 /* Scoped access information. */
 struct kcsan_scoped_access {
 	struct list_head list;
+	/* Access information. */
 	const volatile void *ptr;
 	size_t size;
 	int type;
+	/* Location where scoped access was set up. */
+	unsigned long ip;
 };
 /*
  * Automatically call kcsan_end_scoped_access() when kcsan_scoped_access goes
-- 
cgit v1.2.3


From 8988bacd6045adf39719e5057e43170f83bd1709 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Tue, 31 Aug 2021 17:30:44 +0800
Subject: kobject: unexport kobject_create() in kobject.h

The function kobject_create() is only used by one caller,
kobject_create_and_add(), no other driver uses it, nor is exported to
other modules.

However it's still exported in kobject.h, and can sometimes confuse
users of kobject.h.

Since all users should call kobject_create_and_add(), or if extra
attributes are needed, should alloc the memory manually then call
kobject_init_and_add().

Signed-off-by: Qu Wenruo <wqu@suse.com>
Link: https://lore.kernel.org/r/20210831093044.110729-1-wqu@suse.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/kobject.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kobject.h b/include/linux/kobject.h
index ea30529fba08..efd56f990a46 100644
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -101,7 +101,6 @@ int kobject_init_and_add(struct kobject *kobj,
 
 extern void kobject_del(struct kobject *kobj);
 
-extern struct kobject * __must_check kobject_create(void);
 extern struct kobject * __must_check kobject_create_and_add(const char *name,
 						struct kobject *parent);
 
-- 
cgit v1.2.3


From 82bcb7fb649844a561ff1ac2e2ace4252bdff793 Mon Sep 17 00:00:00 2001
From: Alexandru Ardelean <aardelean@deviqon.com>
Date: Mon, 23 Aug 2021 14:22:01 +0300
Subject: iio: st_sensors: remove st_sensors_deallocate_trigger() function

This change converts the st_sensors_allocate_trigger() to use
device-managed functions.

The parent device of the IIO device object is used. This is based on the
assumption that all other devm_ calls in the ST sensors use this reference.

That makes the st_sensors_deallocate_trigger() function un-needed, so it
can be removed.

Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Alexandru Ardelean <aardelean@deviqon.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20210823112204.243255-3-aardelean@deviqon.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/common/st_sensors.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iio/common/st_sensors.h b/include/linux/iio/common/st_sensors.h
index 8bdbaf3f3796..e74b55244f35 100644
--- a/include/linux/iio/common/st_sensors.h
+++ b/include/linux/iio/common/st_sensors.h
@@ -273,7 +273,6 @@ irqreturn_t st_sensors_trigger_handler(int irq, void *p);
 int st_sensors_allocate_trigger(struct iio_dev *indio_dev,
 				const struct iio_trigger_ops *trigger_ops);
 
-void st_sensors_deallocate_trigger(struct iio_dev *indio_dev);
 int st_sensors_validate_device(struct iio_trigger *trig,
 			       struct iio_dev *indio_dev);
 #else
@@ -282,10 +281,6 @@ static inline int st_sensors_allocate_trigger(struct iio_dev *indio_dev,
 {
 	return 0;
 }
-static inline void st_sensors_deallocate_trigger(struct iio_dev *indio_dev)
-{
-	return;
-}
 #define st_sensors_validate_device NULL
 #endif
 
-- 
cgit v1.2.3


From 5363c6c17b1014f696bf0b2984af4b694062ce8f Mon Sep 17 00:00:00 2001
From: Alexandru Ardelean <aardelean@deviqon.com>
Date: Mon, 23 Aug 2021 14:22:02 +0300
Subject: iio: st_sensors: remove st_sensors_power_disable() function

This change converts the st_sensors_power_enable() function to use
devm_add_action_or_reset() handlers to register regulator_disable hooks for
when the drivers get unloaded.

The parent device of the IIO device object is used. This is based on the
assumption that all other devm_ calls in the ST sensors use this reference.

This makes the st_sensors_power_disable() un-needed.
Removing this also changes unload order a bit, as all ST drivers would call
st_sensors_power_disable() first and iio_device_unregister() after that.

Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Alexandru Ardelean <aardelean@deviqon.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20210823112204.243255-4-aardelean@deviqon.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/common/st_sensors.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iio/common/st_sensors.h b/include/linux/iio/common/st_sensors.h
index e74b55244f35..fc90c202d15e 100644
--- a/include/linux/iio/common/st_sensors.h
+++ b/include/linux/iio/common/st_sensors.h
@@ -293,8 +293,6 @@ int st_sensors_set_axis_enable(struct iio_dev *indio_dev, u8 axis_enable);
 
 int st_sensors_power_enable(struct iio_dev *indio_dev);
 
-void st_sensors_power_disable(struct iio_dev *indio_dev);
-
 int st_sensors_debugfs_reg_access(struct iio_dev *indio_dev,
 				  unsigned reg, unsigned writeval,
 				  unsigned *readval);
-- 
cgit v1.2.3


From 6b658c31bb6bc033f6ae60141c676383fb78784c Mon Sep 17 00:00:00 2001
From: Alexandru Ardelean <aardelean@deviqon.com>
Date: Mon, 23 Aug 2021 14:22:03 +0300
Subject: iio: st_sensors: remove all driver remove functions

At this point all ST driver remove functions do iio_device_unregister().
This change removes them from them and replaces all iio_device_register()
with devm_iio_device_register().

This can be done in a single change relatively easy, since all these remove
functions are define in st_sensors.h.

Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Alexandru Ardelean <aardelean@deviqon.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20210823112204.243255-5-aardelean@deviqon.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/common/st_sensors.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iio/common/st_sensors.h b/include/linux/iio/common/st_sensors.h
index fc90c202d15e..d17ae1e5ca19 100644
--- a/include/linux/iio/common/st_sensors.h
+++ b/include/linux/iio/common/st_sensors.h
@@ -323,21 +323,17 @@ void st_sensors_dev_name_probe(struct device *dev, char *name, int len);
 /* Accelerometer */
 const struct st_sensor_settings *st_accel_get_settings(const char *name);
 int st_accel_common_probe(struct iio_dev *indio_dev);
-void st_accel_common_remove(struct iio_dev *indio_dev);
 
 /* Gyroscope */
 const struct st_sensor_settings *st_gyro_get_settings(const char *name);
 int st_gyro_common_probe(struct iio_dev *indio_dev);
-void st_gyro_common_remove(struct iio_dev *indio_dev);
 
 /* Magnetometer */
 const struct st_sensor_settings *st_magn_get_settings(const char *name);
 int st_magn_common_probe(struct iio_dev *indio_dev);
-void st_magn_common_remove(struct iio_dev *indio_dev);
 
 /* Pressure */
 const struct st_sensor_settings *st_press_get_settings(const char *name);
 int st_press_common_probe(struct iio_dev *indio_dev);
-void st_press_common_remove(struct iio_dev *indio_dev);
 
 #endif /* ST_SENSORS_H */
-- 
cgit v1.2.3


From e42696515414a15774c80f1d454194ce0cd9f145 Mon Sep 17 00:00:00 2001
From: Alexandru Ardelean <aardelean@deviqon.com>
Date: Mon, 23 Aug 2021 14:22:04 +0300
Subject: iio: st_sensors: remove reference to parent device object on
 st_sensor_data

The idea behind it, is that all devm_ calls in ST sensors are bound to the
parent device object.

However, the reference to that object is kept on both the st_sensor_data
struct and the IIO object parent (indio_dev->dev.parent).

This change only adds a bit consistency and uses the reference stored on
indio_dev->dev.parent, to enforce the assumption that all ST sensors' devm_
calls are bound to the same reference as the one store on st_sensor_data.

Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Alexandru Ardelean <aardelean@deviqon.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20210823112204.243255-6-aardelean@deviqon.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/common/st_sensors.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iio/common/st_sensors.h b/include/linux/iio/common/st_sensors.h
index d17ae1e5ca19..22f67845cdd3 100644
--- a/include/linux/iio/common/st_sensors.h
+++ b/include/linux/iio/common/st_sensors.h
@@ -220,7 +220,6 @@ struct st_sensor_settings {
 
 /**
  * struct st_sensor_data - ST sensor device status
- * @dev: Pointer to instance of struct device (I2C or SPI).
  * @trig: The trigger in use by the core driver.
  * @mount_matrix: The mounting matrix of the sensor.
  * @sensor_settings: Pointer to the specific sensor settings in use.
@@ -240,7 +239,6 @@ struct st_sensor_settings {
  * @buffer_data: Data used by buffer part.
  */
 struct st_sensor_data {
-	struct device *dev;
 	struct iio_trigger *trig;
 	struct iio_mount_matrix mount_matrix;
 	struct st_sensor_settings *sensor_settings;
-- 
cgit v1.2.3


From 2935662449dfa4467d2e769a70801c608fd510c3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 13 Sep 2021 07:41:10 +0200
Subject: kernfs: remove kernfs_create_file and kernfs_create_file_ns

All callers actually use __kernfs_create_file.

Acked-by: Christian Brauner <christian.brauner@ubuntu.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20210913054121.616001-3-hch@lst.de
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/kernfs.h | 24 ------------------------
 1 file changed, 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 1093abf7c28c..cecfeedb7361 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -568,30 +568,6 @@ kernfs_create_dir(struct kernfs_node *parent, const char *name, umode_t mode,
 				    priv, NULL);
 }
 
-static inline struct kernfs_node *
-kernfs_create_file_ns(struct kernfs_node *parent, const char *name,
-		      umode_t mode, kuid_t uid, kgid_t gid,
-		      loff_t size, const struct kernfs_ops *ops,
-		      void *priv, const void *ns)
-{
-	struct lock_class_key *key = NULL;
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	key = (struct lock_class_key *)&ops->lockdep_key;
-#endif
-	return __kernfs_create_file(parent, name, mode, uid, gid,
-				    size, ops, priv, ns, key);
-}
-
-static inline struct kernfs_node *
-kernfs_create_file(struct kernfs_node *parent, const char *name, umode_t mode,
-		   loff_t size, const struct kernfs_ops *ops, void *priv)
-{
-	return kernfs_create_file_ns(parent, name, mode,
-				     GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
-				     size, ops, priv, NULL);
-}
-
 static inline int kernfs_remove_by_name(struct kernfs_node *parent,
 					const char *name)
 {
-- 
cgit v1.2.3


From eaf501e0d8af691e532b6b9aee511659cf5ee00c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 13 Sep 2021 07:41:11 +0200
Subject: kernfs: remove the unused lockdep_key field in struct kernfs_ops

Not actually used anywhere.

Acked-by: Christian Brauner <christian.brauner@ubuntu.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20210913054121.616001-4-hch@lst.de
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/kernfs.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index cecfeedb7361..3ccce6f24548 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -269,10 +269,6 @@ struct kernfs_ops {
 			 struct poll_table_struct *pt);
 
 	int (*mmap)(struct kernfs_open_file *of, struct vm_area_struct *vma);
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	struct lock_class_key	lockdep_key;
-#endif
 };
 
 /*
-- 
cgit v1.2.3


From a2aec2c86ef0cad0fd6be718cfeb5cf5eefbfca9 Mon Sep 17 00:00:00 2001
From: "GONG, Ruiqi" <gongruiqi1@huawei.com>
Date: Mon, 30 Aug 2021 16:33:56 +0800
Subject: mtd: Remove obsolete macros only used by the old nand_ecclayout
 struct

All uses of MTD_MAX_{OOBFREE,ECCPOS}_ENTRIES_LARGE have been removed as
commit ef5eeea6e911 ("mtd: nand: brcm: switch to mtd_ooblayout_ops") and
commit aab616e31d1c ("mtd: kill the nand_ecclayout struct") replaced
struct nand_ecclayout by the new mtd_ooblayout_ops interface. Remove
these two macros therefore.

Reported-by: Yi Yang <yiyang13@huawei.com>
Signed-off-by: GONG, Ruiqi <gongruiqi1@huawei.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20210830083356.31702-1-gongruiqi1@huawei.com
---
 include/linux/mtd/mtd.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index 88227044fc86..f5e7dfc2e4e9 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -72,8 +72,6 @@ struct mtd_oob_ops {
 	uint8_t		*oobbuf;
 };
 
-#define MTD_MAX_OOBFREE_ENTRIES_LARGE	32
-#define MTD_MAX_ECCPOS_ENTRIES_LARGE	640
 /**
  * struct mtd_oob_region - oob region definition
  * @offset: region offset
-- 
cgit v1.2.3


From 67f1e027c27054e641584655020a417eaac9cb3a Mon Sep 17 00:00:00 2001
From: Lukas Prediger <lumip@lumip.de>
Date: Tue, 14 Sep 2021 00:09:42 +0100
Subject: drivers/cdrom: improved ioctl for media change detection

The current implementation of the CDROM_MEDIA_CHANGED ioctl relies on
global state, meaning that only one process can detect a disc change
while the ioctl call will return 0 for other calling processes afterwards
(see bug 213267).

This introduces a new cdrom ioctl, CDROM_TIMED_MEDIA_CHANGE, that
works by maintaining a timestamp of the last detected disc change instead
of a boolean flag: Processes calling this ioctl command can provide
a timestamp of the last disc change known to them and receive
an indication whether the disc was changed since then and the updated
timestamp.

I considered fixing the buggy behavior in the original
CDROM_MEDIA_CHANGED ioctl but that would require maintaining state
for each calling process in the kernel, which seems like a worse
solution than introducing this new ioctl.

Signed-off-by: Lukas Prediger <lumip@lumip.de>
Link: https://lore.kernel.org/all/20210912191207.74449-1-lumip@lumip.de
Signed-off-by: Phillip Potter <phil@philpotter.co.uk>
Link: https://lore.kernel.org/r/20210913230942.1188-1-phil@philpotter.co.uk
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/cdrom.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h
index c4fef00abdf3..0a89f111e00e 100644
--- a/include/linux/cdrom.h
+++ b/include/linux/cdrom.h
@@ -64,6 +64,7 @@ struct cdrom_device_info {
 	int for_data;
 	int (*exit)(struct cdrom_device_info *);
 	int mrw_mode_page;
+	__s64 last_media_change_ms;
 };
 
 struct cdrom_device_ops {
-- 
cgit v1.2.3


From e25b694bf1d9ef4a3f36c0b85348f8e780f22139 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 24 Jun 2021 11:41:05 +0200
Subject: x86: Always inline context_tracking_guest_enter()

Yes, it really did out-of-line this....

vmlinux.o: warning: objtool: vmx_vcpu_enter_exit()+0x31: call to context_tracking_guest_enter() leaves .noinstr.text section

000000000019f660 <context_tracking_guest_enter>:
  19f660:	e8 00 00 00 00       	callq  19f665 <context_tracking_guest_enter+0x5>	19f661: R_X86_64_PLT32	__sanitizer_cov_trace_pc-0x4
  19f665:	31 c0                	xor    %eax,%eax
  19f667:	c3                   	retq

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20210624095148.003928226@infradead.org
---
 include/linux/context_tracking.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index 4d7fced3a39f..7a14807c9d1a 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -105,7 +105,7 @@ static inline void user_exit_irqoff(void) { }
 static inline enum ctx_state exception_enter(void) { return 0; }
 static inline void exception_exit(enum ctx_state prev_ctx) { }
 static inline enum ctx_state ct_state(void) { return CONTEXT_DISABLED; }
-static inline bool context_tracking_guest_enter(void) { return false; }
+static __always_inline bool context_tracking_guest_enter(void) { return false; }
 static inline void context_tracking_guest_exit(void) { }
 
 #endif /* !CONFIG_CONTEXT_TRACKING */
-- 
cgit v1.2.3


From a5c071ccfa1728508f31e61213ee795e4529d0d4 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 28 Jul 2021 12:28:27 -0700
Subject: rcu-tasks: Remove second argument of rcu_read_unlock_trace_special()

The second argument of rcu_read_unlock_trace_special() is always zero.
When called from exit_tasks_rcu_finish_trace(), it is the constant
zero, and rcu_read_unlock_trace_special() doesn't get called from
rcu_read_unlock_trace() unless the value of local variable "nesting"
is zero because in that case the early return is taken instead.

This commit therefore removes the "nesting" argument from the
rcu_read_unlock_trace_special() function, substituting the constant
zero within that function.  This commit also adds a WARN_ON_ONCE()
to rcu_read_lock_trace_held() in case non-zeroness some day appears.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcupdate_trace.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate_trace.h b/include/linux/rcupdate_trace.h
index 86c8f6c98412..6f9c35817398 100644
--- a/include/linux/rcupdate_trace.h
+++ b/include/linux/rcupdate_trace.h
@@ -31,7 +31,7 @@ static inline int rcu_read_lock_trace_held(void)
 
 #ifdef CONFIG_TASKS_TRACE_RCU
 
-void rcu_read_unlock_trace_special(struct task_struct *t, int nesting);
+void rcu_read_unlock_trace_special(struct task_struct *t);
 
 /**
  * rcu_read_lock_trace - mark beginning of RCU-trace read-side critical section
@@ -80,7 +80,8 @@ static inline void rcu_read_unlock_trace(void)
 		WRITE_ONCE(t->trc_reader_nesting, nesting);
 		return;  // We assume shallow reader nesting.
 	}
-	rcu_read_unlock_trace_special(t, nesting);
+	WARN_ON_ONCE(nesting != 0);
+	rcu_read_unlock_trace_special(t);
 }
 
 void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func);
-- 
cgit v1.2.3


From 925da92ba5cb0c82d07cdd5049a07e40f54e9c44 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Thu, 26 Aug 2021 22:21:22 -0400
Subject: rcu: Avoid unneeded function call in rcu_read_unlock()

Since commit aa40c138cc8f3 ("rcu: Report QS for outermost PREEMPT=n
rcu_read_unlock() for strict GPs") the function rcu_read_unlock_strict()
is invoked by the inlined rcu_read_unlock() function.  However,
rcu_read_unlock_strict() is an empty function in production kernels,
which are built with CONFIG_RCU_STRICT_GRACE_PERIOD=n.

There is a mention of rcu_read_unlock_strict() in the BPF verifier,
but this is in a deny-list, meaning that BPF does not care whether
rcu_read_unlock_strict() is ever called.

This commit therefore provides a slight performance improvement
by hoisting the check of CONFIG_RCU_STRICT_GRACE_PERIOD from
rcu_read_unlock_strict() into rcu_read_unlock(), thus avoiding the
pointless call to an empty function.

Cc: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcupdate.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 434d12fe2d4f..5e0beb5c5659 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -71,7 +71,8 @@ static inline void __rcu_read_lock(void)
 static inline void __rcu_read_unlock(void)
 {
 	preempt_enable();
-	rcu_read_unlock_strict();
+	if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
+		rcu_read_unlock_strict();
 }
 
 static inline int rcu_preempt_depth(void)
-- 
cgit v1.2.3


From 8dc84dcd7f74b50f81de3dbf6f6b5b146e3a8eea Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Thu, 16 Sep 2021 14:27:41 -0700
Subject: net: phy: broadcom: Enable 10BaseT DAC early wake

Enable the DAC early wake when then link operates at 10BaseT allows
power savings in the hundreds of milli Watts by shutting down the
transmitter. A number of errata have been issued for various Gigabit
PHYs and the recommendation is to enable both the early and forced DAC
wake to be on the safe side. This needs to be done dynamically based
upon the link state, which is why a link_change_notify callback is
utilized.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Link: https://lore.kernel.org/r/20210916212742.1653088-1-f.fainelli@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/brcmphy.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index c2c2147dfeb8..3308cebe1c19 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -233,6 +233,7 @@
 #define MII_BCM54XX_EXP_EXP08			0x0F08
 #define  MII_BCM54XX_EXP_EXP08_RJCT_2MHZ	0x0001
 #define  MII_BCM54XX_EXP_EXP08_EARLY_DAC_WAKE	0x0200
+#define  MII_BCM54XX_EXP_EXP08_FORCE_DAC_WAKE	0x0100
 #define MII_BCM54XX_EXP_EXP75			0x0f75
 #define  MII_BCM54XX_EXP_EXP75_VDACCTRL		0x003c
 #define  MII_BCM54XX_EXP_EXP75_CM_OSC		0x0001
-- 
cgit v1.2.3


From 12235da8c80a1f9909008e4ca6036d5772b81192 Mon Sep 17 00:00:00 2001
From: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Date: Thu, 9 Sep 2021 11:32:18 +0200
Subject: kernel/locking: Add context to ww_mutex_trylock()

i915 will soon gain an eviction path that trylock a whole lot of locks
for eviction, getting dmesg failures like below:

  BUG: MAX_LOCK_DEPTH too low!
  turning off the locking correctness validator.
  depth: 48  max: 48!
  48 locks held by i915_selftest/5776:
   #0: ffff888101a79240 (&dev->mutex){....}-{3:3}, at: __driver_attach+0x88/0x160
   #1: ffffc900009778c0 (reservation_ww_class_acquire){+.+.}-{0:0}, at: i915_vma_pin.constprop.63+0x39/0x1b0 [i915]
   #2: ffff88800cf74de8 (reservation_ww_class_mutex){+.+.}-{3:3}, at: i915_vma_pin.constprop.63+0x5f/0x1b0 [i915]
   #3: ffff88810c7f9e38 (&vm->mutex/1){+.+.}-{3:3}, at: i915_vma_pin_ww+0x1c4/0x9d0 [i915]
   #4: ffff88810bad5768 (reservation_ww_class_mutex){+.+.}-{3:3}, at: i915_gem_evict_something+0x110/0x860 [i915]
   #5: ffff88810bad60e8 (reservation_ww_class_mutex){+.+.}-{3:3}, at: i915_gem_evict_something+0x110/0x860 [i915]
  ...
   #46: ffff88811964d768 (reservation_ww_class_mutex){+.+.}-{3:3}, at: i915_gem_evict_something+0x110/0x860 [i915]
   #47: ffff88811964e0e8 (reservation_ww_class_mutex){+.+.}-{3:3}, at: i915_gem_evict_something+0x110/0x860 [i915]
  INFO: lockdep is turned off.

Fixing eviction to nest into ww_class_acquire is a high priority, but
it requires a rework of the entire driver, which can only be done one
step at a time.

As an intermediate solution, add an acquire context to
ww_mutex_trylock, which allows us to do proper nesting annotations on
the trylocks, making the above lockdep splat disappear.

This is also useful in regulator_lock_nested, which may avoid dropping
regulator_nesting_mutex in the uncontended path, so use it there.

TTM may be another user for this, where we could lock a buffer in a
fastpath with list locks held, without dropping all locks we hold.

[peterz: rework actual ww_mutex_trylock() implementations]
Signed-off-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/YUBGPdDDjKlxAuXJ@hirez.programming.kicks-ass.net
---
 include/linux/dma-resv.h |  2 +-
 include/linux/ww_mutex.h | 15 ++-------------
 2 files changed, 3 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-resv.h b/include/linux/dma-resv.h
index e1ca2080a1ff..39fefb86780b 100644
--- a/include/linux/dma-resv.h
+++ b/include/linux/dma-resv.h
@@ -173,7 +173,7 @@ static inline int dma_resv_lock_slow_interruptible(struct dma_resv *obj,
  */
 static inline bool __must_check dma_resv_trylock(struct dma_resv *obj)
 {
-	return ww_mutex_trylock(&obj->lock);
+	return ww_mutex_trylock(&obj->lock, NULL);
 }
 
 /**
diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h
index 29db736af86d..bb763085479a 100644
--- a/include/linux/ww_mutex.h
+++ b/include/linux/ww_mutex.h
@@ -28,12 +28,10 @@
 #ifndef CONFIG_PREEMPT_RT
 #define WW_MUTEX_BASE			mutex
 #define ww_mutex_base_init(l,n,k)	__mutex_init(l,n,k)
-#define ww_mutex_base_trylock(l)	mutex_trylock(l)
 #define ww_mutex_base_is_locked(b)	mutex_is_locked((b))
 #else
 #define WW_MUTEX_BASE			rt_mutex
 #define ww_mutex_base_init(l,n,k)	__rt_mutex_init(l,n,k)
-#define ww_mutex_base_trylock(l)	rt_mutex_trylock(l)
 #define ww_mutex_base_is_locked(b)	rt_mutex_base_is_locked(&(b)->rtmutex)
 #endif
 
@@ -339,17 +337,8 @@ ww_mutex_lock_slow_interruptible(struct ww_mutex *lock,
 
 extern void ww_mutex_unlock(struct ww_mutex *lock);
 
-/**
- * ww_mutex_trylock - tries to acquire the w/w mutex without acquire context
- * @lock: mutex to lock
- *
- * Trylocks a mutex without acquire context, so no deadlock detection is
- * possible. Returns 1 if the mutex has been acquired successfully, 0 otherwise.
- */
-static inline int __must_check ww_mutex_trylock(struct ww_mutex *lock)
-{
-	return ww_mutex_base_trylock(&lock->base);
-}
+extern int __must_check ww_mutex_trylock(struct ww_mutex *lock,
+					 struct ww_acquire_ctx *ctx);
 
 /***
  * ww_mutex_destroy - mark a w/w mutex unusable
-- 
cgit v1.2.3


From a2e05ddda11b0bd529f443df9089ab498b2c2642 Mon Sep 17 00:00:00 2001
From: Zhouyi Zhou <zhouzhouyi@gmail.com>
Date: Wed, 11 Aug 2021 10:59:20 +0800
Subject: lockdep: Improve comments in wait-type checks

Comments in wait-type checks be improved by mentioning the
PREEPT_RT kernel configure option.

Signed-off-by: Zhouyi Zhou <zhouzhouyi@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Paul E. McKenney <paulmck@kernel.org>
Link: https://lkml.kernel.org/r/20210811025920.20751-1-zhouzhouyi@gmail.com
---
 include/linux/lockdep_types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h
index 3e726ace5c62..d22430840b53 100644
--- a/include/linux/lockdep_types.h
+++ b/include/linux/lockdep_types.h
@@ -21,7 +21,7 @@ enum lockdep_wait_type {
 	LD_WAIT_SPIN,		/* spin loops, raw_spinlock_t etc.. */
 
 #ifdef CONFIG_PROVE_RAW_LOCK_NESTING
-	LD_WAIT_CONFIG,		/* CONFIG_PREEMPT_LOCK, spinlock_t etc.. */
+	LD_WAIT_CONFIG,		/* preemptible in PREEMPT_RT, spinlock_t etc.. */
 #else
 	LD_WAIT_CONFIG = LD_WAIT_SPIN,
 #endif
-- 
cgit v1.2.3


From f7427ba5ce9c5438ad392b6cbcc4ca8a0487d7e7 Mon Sep 17 00:00:00 2001
From: Shaokun Zhang <zhangshaokun@hisilicon.com>
Date: Wed, 25 Aug 2021 15:07:04 +0800
Subject: locking/lockdep: Cleanup the repeated declaration

'struct task_struct' has been decleared twice, so keep the top one and
cleanup the repeated one.

Signed-off-by: Shaokun Zhang <zhangshaokun@hisilicon.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1629875224-32751-1-git-send-email-zhangshaokun@hisilicon.com
---
 include/linux/debug_locks.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h
index 3f49e65169c6..dbb409d77d4f 100644
--- a/include/linux/debug_locks.h
+++ b/include/linux/debug_locks.h
@@ -47,8 +47,6 @@ extern int debug_locks_off(void);
 # define locking_selftest()	do { } while (0)
 #endif
 
-struct task_struct;
-
 #ifdef CONFIG_LOCKDEP
 extern void debug_show_all_locks(void);
 extern void debug_show_held_locks(struct task_struct *task);
-- 
cgit v1.2.3


From f68d08c437f98ee19a14142b9de2d7afe2032d5c Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Fri, 17 Sep 2021 11:15:50 -0700
Subject: net: phy: bcm7xxx: Add EPHY entry for 72165

72165 is a 16nm process SoC with a 10/100 integrated Ethernet PHY,
create a new macro and set of functions for this different process type.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Link: https://lore.kernel.org/r/20210917181551.2836036-1-f.fainelli@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/brcmphy.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index 3308cebe1c19..07e1dfadbbdf 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -32,6 +32,7 @@
 
 #define PHY_ID_BCM72113			0x35905310
 #define PHY_ID_BCM72116			0x35905350
+#define PHY_ID_BCM72165			0x35905340
 #define PHY_ID_BCM7250			0xae025280
 #define PHY_ID_BCM7255			0xae025120
 #define PHY_ID_BCM7260			0xae025190
-- 
cgit v1.2.3


From 335ff4990cf3bfa42d8846f9b3d8c09456f51801 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Fri, 17 Sep 2021 11:29:03 -0700
Subject: bpf: Merge printk and seq_printf VARARG max macros

MAX_SNPRINTF_VARARGS and MAX_SEQ_PRINTF_VARARGS are used by bpf helpers
bpf_snprintf and bpf_seq_printf to limit their varargs. Both call into
bpf_bprintf_prepare for print formatting logic and have convenience
macros in libbpf (BPF_SNPRINTF, BPF_SEQ_PRINTF) which use the same
helper macros to convert varargs to a byte array.

Changing shared functionality to support more varargs for either bpf
helper would affect the other as well, so let's combine the _VARARGS
macros to make this more obvious.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210917182911.2426606-2-davemarchevsky@fb.com
---
 include/linux/bpf.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f4c16f19f83e..be8d57e6e78a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2216,6 +2216,8 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
 struct btf_id_set;
 bool btf_id_set_contains(const struct btf_id_set *set, u32 id);
 
+#define MAX_BPRINTF_VARARGS		12
+
 int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
 			u32 **bin_buf, u32 num_args);
 void bpf_bprintf_cleanup(void);
-- 
cgit v1.2.3


From 10aceb629e198429c849d5e995c3bb1ba7a9aaa3 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Fri, 17 Sep 2021 11:29:05 -0700
Subject: bpf: Add bpf_trace_vprintk helper

This helper is meant to be "bpf_trace_printk, but with proper vararg
support". Follow bpf_snprintf's example and take a u64 pseudo-vararg
array. Write to /sys/kernel/debug/tracing/trace_pipe using the same
mechanism as bpf_trace_printk. The functionality of this helper was
requested in the libbpf issue tracker [0].

[0] Closes: https://github.com/libbpf/libbpf/issues/315

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210917182911.2426606-4-davemarchevsky@fb.com
---
 include/linux/bpf.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index be8d57e6e78a..b6c45a6cbbba 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1088,6 +1088,7 @@ bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *f
 int bpf_prog_calc_tag(struct bpf_prog *fp);
 
 const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
+const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void);
 
 typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const void *src,
 					unsigned long off, unsigned long len);
-- 
cgit v1.2.3


From 55c42fa7fa331f98062c32799456420930b8bf8c Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 17 Sep 2021 16:33:19 -0700
Subject: mptcp: add MPTCP_INFO getsockopt

Its not compatible with multipath-tcp.org kernel one.

1. The out-of-tree implementation defines a different 'struct mptcp_info',
   with embedded __user addresses for additional data such as
   endpoint addresses.

2. Mat Martineau points out that embedded __user addresses doesn't work
with BPF_CGROUP_RUN_PROG_GETSOCKOPT() which assumes that copying in
optsize bytes from optval provides all data that got copied to userspace.

This provides mptcp_info data for the given mptcp socket.

Userspace sets optlen to the size of the structure it expects.
The kernel updates it to contain the number of bytes that it copied.

This allows to append more information to the structure later.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/socket.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 041d6032a348..7612d760b6a9 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -364,6 +364,7 @@ struct ucred {
 #define SOL_KCM		281
 #define SOL_TLS		282
 #define SOL_XDP		283
+#define SOL_MPTCP	284
 
 /* IPX options */
 #define IPX_TYPE	1
-- 
cgit v1.2.3


From 5bd2182d58e9d9c6279b7a8a2f9b41add0e7f9cb Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Tue, 16 Feb 2021 19:46:48 -0500
Subject: audit,io_uring,io-wq: add some basic audit support to io_uring

This patch adds basic auditing to io_uring operations, regardless of
their context.  This is accomplished by allocating audit_context
structures for the io-wq worker and io_uring SQPOLL kernel threads
as well as explicitly auditing the io_uring operations in
io_issue_sqe().  Individual io_uring operations can bypass auditing
through the "audit_skip" field in the struct io_op_def definition for
the operation; although great care must be taken so that security
relevant io_uring operations do not bypass auditing; please contact
the audit mailing list (see the MAINTAINERS file) with any questions.

The io_uring operations are audited using a new AUDIT_URINGOP record,
an example is shown below:

  type=UNKNOWN[1336] msg=audit(1631800225.981:37289):
    uring_op=19 success=yes exit=0 items=0 ppid=15454 pid=15681
    uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0
    subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023
    key=(null)

Thanks to Richard Guy Briggs for review and feedback.

Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/audit.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 82b7c1116a85..d656a06dd909 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -286,7 +286,10 @@ static inline int audit_signal_info(int sig, struct task_struct *t)
 /* These are defined in auditsc.c */
 				/* Public API */
 extern int  audit_alloc(struct task_struct *task);
+extern int  audit_alloc_kernel(struct task_struct *task);
 extern void __audit_free(struct task_struct *task);
+extern void __audit_uring_entry(u8 op);
+extern void __audit_uring_exit(int success, long code);
 extern void __audit_syscall_entry(int major, unsigned long a0, unsigned long a1,
 				  unsigned long a2, unsigned long a3);
 extern void __audit_syscall_exit(int ret_success, long ret_value);
@@ -323,6 +326,21 @@ static inline void audit_free(struct task_struct *task)
 	if (unlikely(task->audit_context))
 		__audit_free(task);
 }
+static inline void audit_uring_entry(u8 op)
+{
+	/*
+	 * We intentionally check audit_context() before audit_enabled as most
+	 * Linux systems (as of ~2021) rely on systemd which forces audit to
+	 * be enabled regardless of the user's audit configuration.
+	 */
+	if (unlikely(audit_context() && audit_enabled))
+		__audit_uring_entry(op);
+}
+static inline void audit_uring_exit(int success, long code)
+{
+	if (unlikely(!audit_dummy_context()))
+		__audit_uring_exit(success, code);
+}
 static inline void audit_syscall_entry(int major, unsigned long a0,
 				       unsigned long a1, unsigned long a2,
 				       unsigned long a3)
@@ -554,8 +572,16 @@ static inline int audit_alloc(struct task_struct *task)
 {
 	return 0;
 }
+static inline int audit_alloc_kernel(struct task_struct *task)
+{
+	return 0;
+}
 static inline void audit_free(struct task_struct *task)
 { }
+static inline void audit_uring_entry(u8 op)
+{ }
+static inline void audit_uring_exit(int success, long code)
+{ }
 static inline void audit_syscall_entry(int major, unsigned long a0,
 				       unsigned long a1, unsigned long a2,
 				       unsigned long a3)
-- 
cgit v1.2.3


From 3a862cacf8670729b1ad8fc28e4f7e078f9c570c Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Mon, 1 Feb 2021 19:22:44 -0500
Subject: fs: add anon_inode_getfile_secure() similar to
 anon_inode_getfd_secure()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extending the secure anonymous inode support to other subsystems
requires that we have a secure anon_inode_getfile() variant in
addition to the existing secure anon_inode_getfd() variant.

Thankfully we can reuse the existing __anon_inode_getfile() function
and just wrap it with the proper arguments.

Acked-by: Mickaël Salaün <mic@linux.microsoft.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/anon_inodes.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/anon_inodes.h b/include/linux/anon_inodes.h
index 71881a2b6f78..5deaddbd7927 100644
--- a/include/linux/anon_inodes.h
+++ b/include/linux/anon_inodes.h
@@ -15,6 +15,10 @@ struct inode;
 struct file *anon_inode_getfile(const char *name,
 				const struct file_operations *fops,
 				void *priv, int flags);
+struct file *anon_inode_getfile_secure(const char *name,
+				       const struct file_operations *fops,
+				       void *priv, int flags,
+				       const struct inode *context_inode);
 int anon_inode_getfd(const char *name, const struct file_operations *fops,
 		     void *priv, int flags);
 int anon_inode_getfd_secure(const char *name,
-- 
cgit v1.2.3


From cdc1404a40461faba23c5a5ad40adcc7eecc1580 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Mon, 1 Feb 2021 19:56:49 -0500
Subject: lsm,io_uring: add LSM hooks to io_uring

A full expalantion of io_uring is beyond the scope of this commit
description, but in summary it is an asynchronous I/O mechanism
which allows for I/O requests and the resulting data to be queued
in memory mapped "rings" which are shared between the kernel and
userspace.  Optionally, io_uring offers the ability for applications
to spawn kernel threads to dequeue I/O requests from the ring and
submit the requests in the kernel, helping to minimize the syscall
overhead.  Rings are accessed in userspace by memory mapping a file
descriptor provided by the io_uring_setup(2), and can be shared
between applications as one might do with any open file descriptor.
Finally, process credentials can be registered with a given ring
and any process with access to that ring can submit I/O requests
using any of the registered credentials.

While the io_uring functionality is widely recognized as offering a
vastly improved, and high performing asynchronous I/O mechanism, its
ability to allow processes to submit I/O requests with credentials
other than its own presents a challenge to LSMs.  When a process
creates a new io_uring ring the ring's credentials are inhertied
from the calling process; if this ring is shared with another
process operating with different credentials there is the potential
to bypass the LSMs security policy.  Similarly, registering
credentials with a given ring allows any process with access to that
ring to submit I/O requests with those credentials.

In an effort to allow LSMs to apply security policy to io_uring I/O
operations, this patch adds two new LSM hooks.  These hooks, in
conjunction with the LSM anonymous inode support previously
submitted, allow an LSM to apply access control policy to the
sharing of io_uring rings as well as any io_uring credential changes
requested by a process.

The new LSM hooks are described below:

 * int security_uring_override_creds(cred)
   Controls if the current task, executing an io_uring operation,
   is allowed to override it's credentials with @cred.  In cases
   where the current task is a user application, the current
   credentials will be those of the user application.  In cases
   where the current task is a kernel thread servicing io_uring
   requests the current credentials will be those of the io_uring
   ring (inherited from the process that created the ring).

 * int security_uring_sqpoll(void)
   Controls if the current task is allowed to create an io_uring
   polling thread (IORING_SETUP_SQPOLL).  Without a SQPOLL thread
   in the kernel processes must submit I/O requests via
   io_uring_enter(2) which allows us to compare any requested
   credential changes against the application making the request.
   With a SQPOLL thread, we can no longer compare requested
   credential changes against the application making the request,
   the comparison is made against the ring's credentials.

Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h |  5 +++++
 include/linux/lsm_hooks.h     | 13 +++++++++++++
 include/linux/security.h      | 16 ++++++++++++++++
 3 files changed, 34 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 2adeea44c0d5..b3c525353769 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -402,3 +402,8 @@ LSM_HOOK(void, LSM_RET_VOID, perf_event_free, struct perf_event *event)
 LSM_HOOK(int, 0, perf_event_read, struct perf_event *event)
 LSM_HOOK(int, 0, perf_event_write, struct perf_event *event)
 #endif /* CONFIG_PERF_EVENTS */
+
+#ifdef CONFIG_IO_URING
+LSM_HOOK(int, 0, uring_override_creds, const struct cred *new)
+LSM_HOOK(int, 0, uring_sqpoll, void)
+#endif /* CONFIG_IO_URING */
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 5c4c5c0602cb..0eb0ae95c4c4 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1557,6 +1557,19 @@
  * 	Read perf_event security info if allowed.
  * @perf_event_write:
  * 	Write perf_event security info if allowed.
+ *
+ * Security hooks for io_uring
+ *
+ * @uring_override_creds:
+ *      Check if the current task, executing an io_uring operation, is allowed
+ *      to override it's credentials with @new.
+ *
+ *      @new: the new creds to use
+ *
+ * @uring_sqpoll:
+ *      Check whether the current task is allowed to spawn a io_uring polling
+ *      thread (IORING_SETUP_SQPOLL).
+ *
  */
 union security_list_options {
 	#define LSM_HOOK(RET, DEFAULT, NAME, ...) RET (*NAME)(__VA_ARGS__);
diff --git a/include/linux/security.h b/include/linux/security.h
index 5b7288521300..7979b9629a42 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -2038,4 +2038,20 @@ static inline int security_perf_event_write(struct perf_event *event)
 #endif /* CONFIG_SECURITY */
 #endif /* CONFIG_PERF_EVENTS */
 
+#ifdef CONFIG_IO_URING
+#ifdef CONFIG_SECURITY
+extern int security_uring_override_creds(const struct cred *new);
+extern int security_uring_sqpoll(void);
+#else
+static inline int security_uring_override_creds(const struct cred *new)
+{
+	return 0;
+}
+static inline int security_uring_sqpoll(void)
+{
+	return 0;
+}
+#endif /* CONFIG_SECURITY */
+#endif /* CONFIG_IO_URING */
+
 #endif /* ! __LINUX_SECURITY_H */
-- 
cgit v1.2.3


From f279294b329363eb6ada568e494d609ef78e3e8e Mon Sep 17 00:00:00 2001
From: Chunguang Xu <brookxu@tencent.com>
Date: Fri, 17 Sep 2021 20:44:14 +0800
Subject: misc_cgroup: introduce misc.events to count failures
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce misc.events to make it easier for us to understand
the pressure of resources. Currently only the 'max' event is
implemented, which indicates the times the resource is about
to exceeds the max limit.

Signed-off-by: Chunguang Xu <brookxu@tencent.com>
Reviewed-by: Vipin Sharma <vipinsh@google.com>
Reviewed-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/misc_cgroup.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h
index da2367e2ac1e..091f2d2a1aec 100644
--- a/include/linux/misc_cgroup.h
+++ b/include/linux/misc_cgroup.h
@@ -36,6 +36,7 @@ struct misc_cg;
 struct misc_res {
 	unsigned long max;
 	atomic_long_t usage;
+	atomic_long_t events;
 	bool failed;
 };
 
@@ -46,6 +47,10 @@ struct misc_res {
  */
 struct misc_cg {
 	struct cgroup_subsys_state css;
+
+	/* misc.events */
+	struct cgroup_file events_file;
+
 	struct misc_res res[MISC_CG_RES_TYPES];
 };
 
-- 
cgit v1.2.3


From b03357528fd9516600ff358ab5d4b887b8cb80e8 Mon Sep 17 00:00:00 2001
From: Chunguang Xu <brookxu@tencent.com>
Date: Fri, 17 Sep 2021 20:44:15 +0800
Subject: misc_cgroup: remove error log to avoid log flood
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In scenarios where containers are frequently created and deleted,
a large number of error logs maybe generated. The logs only show
which node is about to go over the max limit, not the node which
resource request failed. As misc.events has provided relevant
information, maybe we can remove this log.

Signed-off-by: Chunguang Xu <brookxu@tencent.com>
Reviewed-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/misc_cgroup.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h
index 091f2d2a1aec..c238207d1615 100644
--- a/include/linux/misc_cgroup.h
+++ b/include/linux/misc_cgroup.h
@@ -37,7 +37,6 @@ struct misc_res {
 	unsigned long max;
 	atomic_long_t usage;
 	atomic_long_t events;
-	bool failed;
 };
 
 /**
-- 
cgit v1.2.3


From 4373b3dc922038e8924f648506f6556f2afa7e77 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Thu, 9 Sep 2021 11:45:13 -0700
Subject: fscrypt: remove fscrypt_operations::max_namelen

The max_namelen field is unnecessary, as it is set to 255 (NAME_MAX) on
all filesystems that support fscrypt (or plan to support fscrypt).  For
simplicity, just use NAME_MAX directly instead.

Link: https://lore.kernel.org/r/20210909184513.139281-1-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 include/linux/fscrypt.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index e912ed9141d9..91ea9477e9bd 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -118,9 +118,6 @@ struct fscrypt_operations {
 	 */
 	bool (*empty_dir)(struct inode *inode);
 
-	/* The filesystem's maximum ciphertext filename length, in bytes */
-	unsigned int max_namelen;
-
 	/*
 	 * Check whether the filesystem's inode numbers and UUID are stable,
 	 * meaning that they will never be changed even by offline operations
-- 
cgit v1.2.3


From 9d881361206ebcf6285c2ec2ef275aff80875347 Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Tue, 21 Sep 2021 12:42:25 +0300
Subject: bus: ti-sysc: Add quirk handling for reinit on context lost

Some interconnect target modules such as otg and gpmc on am335x need a
re-init after resume. As we also have PM runtime cases where the context
may be lost, let's handle these all with cpu_pm.

For the am335x resume path, we already have cpu_pm_resume() call
cpu_pm_cluster_exit().

Signed-off-by: Tony Lindgren <tony@atomide.com>
---
 include/linux/platform_data/ti-sysc.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/ti-sysc.h b/include/linux/platform_data/ti-sysc.h
index 9837fb011f2f..989aa30c598d 100644
--- a/include/linux/platform_data/ti-sysc.h
+++ b/include/linux/platform_data/ti-sysc.h
@@ -50,6 +50,7 @@ struct sysc_regbits {
 	s8 emufree_shift;
 };
 
+#define SYSC_QUIRK_REINIT_ON_CTX_LOST	BIT(28)
 #define SYSC_QUIRK_REINIT_ON_RESUME	BIT(27)
 #define SYSC_QUIRK_GPMC_DEBUG		BIT(26)
 #define SYSC_MODULE_QUIRK_ENA_RESETDONE	BIT(25)
-- 
cgit v1.2.3


From 6a52bc2b81fa06b7ab98472c7d80644e8e071af6 Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Tue, 21 Sep 2021 12:42:25 +0300
Subject: bus: ti-sysc: Add quirk handling for reset on re-init

At least am335x gpmc module needs a reset in addition to re-init on resume.
Let's add a quirk handling for reset on re-init.

Signed-off-by: Tony Lindgren <tony@atomide.com>
---
 include/linux/platform_data/ti-sysc.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/ti-sysc.h b/include/linux/platform_data/ti-sysc.h
index 989aa30c598d..f11244f5c0b6 100644
--- a/include/linux/platform_data/ti-sysc.h
+++ b/include/linux/platform_data/ti-sysc.h
@@ -50,6 +50,7 @@ struct sysc_regbits {
 	s8 emufree_shift;
 };
 
+#define SYSC_QUIRK_RESET_ON_CTX_LOST	BIT(29)
 #define SYSC_QUIRK_REINIT_ON_CTX_LOST	BIT(28)
 #define SYSC_QUIRK_REINIT_ON_RESUME	BIT(27)
 #define SYSC_QUIRK_GPMC_DEBUG		BIT(26)
-- 
cgit v1.2.3


From 5c99fa737c693a50419606d8f8266974a3521b32 Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Tue, 21 Sep 2021 12:42:25 +0300
Subject: bus: ti-sysc: Handle otg force idle quirk

Let's add handling the otg force idle quirk for the old omap2430 glue layer
used up to omap4 as the musb driver quirk only works if the driver is
loaded. Unlike with the am335x glue layer, looks like we don't need the
quirk handling for SYSC_QUIRK_REINIT_ON_CTX_LOST.

Eventually when all the musb using SoCs are booting with device tree based
configuration, we can just remove the related quirk handling from the
musb driver.

Signed-off-by: Tony Lindgren <tony@atomide.com>
---
 include/linux/platform_data/ti-sysc.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/ti-sysc.h b/include/linux/platform_data/ti-sysc.h
index f11244f5c0b6..eb556f988d57 100644
--- a/include/linux/platform_data/ti-sysc.h
+++ b/include/linux/platform_data/ti-sysc.h
@@ -50,6 +50,7 @@ struct sysc_regbits {
 	s8 emufree_shift;
 };
 
+#define SYSC_MODULE_QUIRK_OTG		BIT(30)
 #define SYSC_QUIRK_RESET_ON_CTX_LOST	BIT(29)
 #define SYSC_QUIRK_REINIT_ON_CTX_LOST	BIT(28)
 #define SYSC_QUIRK_REINIT_ON_RESUME	BIT(27)
-- 
cgit v1.2.3


From d6da08ed1425180b8d54c828ec06d247fd915d60 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Mon, 20 Sep 2021 14:54:14 -0700
Subject: net: phy: broadcom: Add IDDQ-SR mode

Add support for putting the PHY into IDDQ Soft Recovery mode by setting
the TOP_MISC register bits accordingly. This requires us to implement a
custom bcm54xx_suspend() routine which diverges from genphy_suspend() in
order to configure the PHY to enter IDDQ with software recovery as well
as avoid doing a read/modify/write on the BMCR register.

Doing a read/modify/write on the BMCR register means that the
auto-negotation bit may remain which interferes with the ability to put
the PHY into IDDQ-SR mode. We do software reset upon suspend in order to
put the PHY back into its state prior to suspend as recommended by the
datasheet.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/brcmphy.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index 07e1dfadbbdf..b119d6819d6c 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -67,6 +67,7 @@
 #define PHY_BRCM_CLEAR_RGMII_MODE	0x00000004
 #define PHY_BRCM_DIS_TXCRXC_NOENRGY	0x00000008
 #define PHY_BRCM_EN_MASTER_MODE		0x00000010
+#define PHY_BRCM_IDDQ_SUSPEND		0x000000220
 
 /* Broadcom BCM7xxx specific workarounds */
 #define PHY_BRCM_7XXX_REV(x)		(((x) >> 8) & 0xff)
@@ -84,6 +85,7 @@
 
 #define MII_BCM54XX_EXP_DATA	0x15	/* Expansion register data */
 #define MII_BCM54XX_EXP_SEL	0x17	/* Expansion register select */
+#define MII_BCM54XX_EXP_SEL_TOP	0x0d00	/* TOP_MISC expansion register select */
 #define MII_BCM54XX_EXP_SEL_SSD	0x0e00	/* Secondary SerDes select */
 #define MII_BCM54XX_EXP_SEL_ER	0x0f00	/* Expansion register select */
 #define MII_BCM54XX_EXP_SEL_ETC	0x0d00	/* Expansion register spare + 2k mem */
@@ -243,6 +245,12 @@
 #define MII_BCM54XX_EXP_EXP97			0x0f97
 #define  MII_BCM54XX_EXP_EXP97_MYST		0x0c0c
 
+/* Top-MISC expansion registers */
+#define BCM54XX_TOP_MISC_IDDQ_CTRL		(MII_BCM54XX_EXP_SEL_TOP + 0x06)
+#define BCM54XX_TOP_MISC_IDDQ_LP		(1 << 0)
+#define BCM54XX_TOP_MISC_IDDQ_SD		(1 << 2)
+#define BCM54XX_TOP_MISC_IDDQ_SR		(1 << 3)
+
 /*
  * BCM5482: Secondary SerDes registers
  */
-- 
cgit v1.2.3


From 06dc660e6eb8817c4c379d2ca290ae0b3f77c69f Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran <oohall@gmail.com>
Date: Tue, 14 Sep 2021 01:27:08 +1000
Subject: PCI: Rename pcibios_add_device() to pcibios_device_add()

The general convention for pcibios_* hooks is that they're named after the
corresponding pci_* function they provide a hook for. The exception is
pcibios_add_device() which provides a hook for pci_device_add().

Rename pcibios_add_device() to pcibios_device_add() so it matches
pci_device_add().

Also, remove the export of the microblaze version. The only caller must be
compiled as a built-in so there's no reason for the export.

Link: https://lore.kernel.org/r/20210913152709.48013-1-oohall@gmail.com
Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Niklas Schnelle <schnelle@linux.ibm.com>	# s390
---
 include/linux/pci.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index cd8aa6fce204..7e0ce3a4d5a1 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -2126,7 +2126,7 @@ void pcibios_disable_device(struct pci_dev *dev);
 void pcibios_set_master(struct pci_dev *dev);
 int pcibios_set_pcie_reset_state(struct pci_dev *dev,
 				 enum pcie_reset_state state);
-int pcibios_add_device(struct pci_dev *dev);
+int pcibios_device_add(struct pci_dev *dev);
 void pcibios_release_device(struct pci_dev *dev);
 #ifdef CONFIG_PCI
 void pcibios_penalize_isa_irq(int irq, int active);
-- 
cgit v1.2.3


From d1c6e08e7503649e4a4f3f9e700e2c05300b6379 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 8 Sep 2021 22:11:37 -0700
Subject: libnvdimm/labels: Add uuid helpers

In preparation for CXL labels that move the uuid to a different offset
in the label, add nsl_{ref,get,validate}_uuid(). These helpers use the
proper uuid_t type. That type definition predated the libnvdimm
subsystem, so now is as a good a time as any to convert all the uuid
handling in the subsystem to uuid_t to match the helpers.

Note that the uuid fields in the label data and superblocks is not
replaced per Andy's expectation that uuid_t is a kernel internal type
not to appear in external ABI interfaces. So, in those case
{import,export}_uuid() is used to go between the 2 types.

Also note that this rework uncovered some unnecessary copies for label
comparisons, those are cleaned up with nsl_uuid_equal().

As for the whitespace changes, all new code is clang-format compliant.

Reported-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Link: https://lore.kernel.org/r/163116429748.2460985.15659993454313919977.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/nd.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nd.h b/include/linux/nd.h
index ee9ad76afbba..8a8c63edb1b2 100644
--- a/include/linux/nd.h
+++ b/include/linux/nd.h
@@ -88,7 +88,7 @@ struct nd_namespace_pmem {
 	struct nd_namespace_io nsio;
 	unsigned long lbasize;
 	char *alt_name;
-	u8 *uuid;
+	uuid_t *uuid;
 	int id;
 };
 
@@ -105,7 +105,7 @@ struct nd_namespace_pmem {
 struct nd_namespace_blk {
 	struct nd_namespace_common common;
 	char *alt_name;
-	u8 *uuid;
+	uuid_t *uuid;
 	int id;
 	unsigned long lbasize;
 	resource_size_t size;
-- 
cgit v1.2.3


From 8c75d585b931ac874fbe4ee5a8f1811d20c2817f Mon Sep 17 00:00:00 2001
From: Deepak Kumar Singh <deesin@codeaurora.org>
Date: Tue, 31 Aug 2021 20:00:27 +0530
Subject: soc: qcom: aoss: Expose send for generic usecase

Not all upcoming usecases will have an interface to allow the aoss
driver to hook onto. Expose the send api and create a get function to
enable drivers to send their own messages to aoss.

Signed-off-by: Chris Lew <clew@codeaurora.org>
Signed-off-by: Deepak Kumar Singh <deesin@codeaurora.org>
Reviewed-by: Stephen Boyd <swboyd@chromium.org>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Link: https://lore.kernel.org/r/1630420228-31075-2-git-send-email-deesin@codeaurora.org
---
 include/linux/soc/qcom/qcom_aoss.h | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 include/linux/soc/qcom/qcom_aoss.h

(limited to 'include/linux')

diff --git a/include/linux/soc/qcom/qcom_aoss.h b/include/linux/soc/qcom/qcom_aoss.h
new file mode 100644
index 000000000000..3c2a82e606f8
--- /dev/null
+++ b/include/linux/soc/qcom/qcom_aoss.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2021, The Linux Foundation. All rights reserved.
+ */
+
+#ifndef __QCOM_AOSS_H__
+#define __QCOM_AOSS_H__
+
+#include <linux/err.h>
+#include <linux/device.h>
+
+struct qmp;
+
+#if IS_ENABLED(CONFIG_QCOM_AOSS_QMP)
+
+int qmp_send(struct qmp *qmp, const void *data, size_t len);
+struct qmp *qmp_get(struct device *dev);
+void qmp_put(struct qmp *qmp);
+
+#else
+
+static inline int qmp_send(struct qmp *qmp, const void *data, size_t len)
+{
+	return -ENODEV;
+}
+
+static inline struct qmp *qmp_get(struct device *dev)
+{
+	return ERR_PTR(-ENODEV);
+}
+
+static inline void qmp_put(struct qmp *qmp)
+{
+}
+
+#endif
+
+#endif
-- 
cgit v1.2.3


From b468e688240b58ece498c430c377bb81b78a41f1 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Tue, 14 Sep 2021 11:11:20 +0200
Subject: tty: remove flags from struct tty_ldisc_ops

The last user was apparently removed by commit a352def21a64 (tty: Ldisc
revamp) in 2008. So remove the field completely, the only setter
(n_tty_inherit_ops) and also its only possible value
(LDISC_FLAG_DEFINED).

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20210914091134.17426-2-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/tty_ldisc.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h
index b1d812e902aa..1f37184eee63 100644
--- a/include/linux/tty_ldisc.h
+++ b/include/linux/tty_ldisc.h
@@ -180,7 +180,6 @@ extern int ldsem_down_write_nested(struct ld_semaphore *sem, int subclass,
 struct tty_ldisc_ops {
 	char	*name;
 	int	num;
-	int	flags;
 
 	/*
 	 * The following routines are called from above.
@@ -220,8 +219,6 @@ struct tty_ldisc {
 	struct tty_struct *tty;
 };
 
-#define LDISC_FLAG_DEFINED	0x00000001
-
 #define MODULE_ALIAS_LDISC(ldisc) \
 	MODULE_ALIAS("tty-ldisc-" __stringify(ldisc))
 
-- 
cgit v1.2.3


From 7894193436b65f304ba790abe2532a7a3bbeb7c8 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Tue, 14 Sep 2021 11:11:21 +0200
Subject: tty: remove extern from functions in tty headers

After the recent headers cleanup, some function declarations still have
extern before them. It is superfluous (for function declarations), so
remove extern from those which still have it.

This unifies them with the rest of the files.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20210914091134.17426-3-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/tty.h        | 139 ++++++++++++++++++++++-----------------------
 include/linux/tty_driver.h |   8 +--
 include/linux/tty_flip.h   |  20 +++----
 include/linux/tty_ldisc.h  |  22 +++----
 4 files changed, 94 insertions(+), 95 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tty.h b/include/linux/tty.h
index 168e57e40bbb..15453b0c4081 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -252,20 +252,20 @@ static inline bool tty_throttled(struct tty_struct *tty)
 }
 
 #ifdef CONFIG_TTY
-extern void tty_kref_put(struct tty_struct *tty);
-extern struct pid *tty_get_pgrp(struct tty_struct *tty);
-extern void tty_vhangup_self(void);
-extern void disassociate_ctty(int priv);
-extern dev_t tty_devnum(struct tty_struct *tty);
-extern void proc_clear_tty(struct task_struct *p);
-extern struct tty_struct *get_current_tty(void);
+void tty_kref_put(struct tty_struct *tty);
+struct pid *tty_get_pgrp(struct tty_struct *tty);
+void tty_vhangup_self(void);
+void disassociate_ctty(int priv);
+dev_t tty_devnum(struct tty_struct *tty);
+void proc_clear_tty(struct task_struct *p);
+struct tty_struct *get_current_tty(void);
 /* tty_io.c */
-extern int __init tty_init(void);
-extern const char *tty_name(const struct tty_struct *tty);
-extern struct tty_struct *tty_kopen_exclusive(dev_t device);
-extern struct tty_struct *tty_kopen_shared(dev_t device);
-extern void tty_kclose(struct tty_struct *tty);
-extern int tty_dev_name_to_number(const char *name, dev_t *number);
+int __init tty_init(void);
+const char *tty_name(const struct tty_struct *tty);
+struct tty_struct *tty_kopen_exclusive(dev_t device);
+struct tty_struct *tty_kopen_shared(dev_t device);
+void tty_kclose(struct tty_struct *tty);
+int tty_dev_name_to_number(const char *name, dev_t *number);
 #else
 static inline void tty_kref_put(struct tty_struct *tty)
 { }
@@ -296,7 +296,7 @@ static inline int tty_dev_name_to_number(const char *name, dev_t *number)
 
 extern struct ktermios tty_std_termios;
 
-extern int vcs_init(void);
+int vcs_init(void);
 
 extern struct class *tty_class;
 
@@ -316,34 +316,34 @@ static inline struct tty_struct *tty_kref_get(struct tty_struct *tty)
 	return tty;
 }
 
-extern const char *tty_driver_name(const struct tty_struct *tty);
-extern void tty_wait_until_sent(struct tty_struct *tty, long timeout);
-extern void stop_tty(struct tty_struct *tty);
-extern void start_tty(struct tty_struct *tty);
-extern void tty_write_message(struct tty_struct *tty, char *msg);
-extern int tty_send_xchar(struct tty_struct *tty, char ch);
-extern int tty_put_char(struct tty_struct *tty, unsigned char c);
-extern unsigned int tty_chars_in_buffer(struct tty_struct *tty);
-extern unsigned int tty_write_room(struct tty_struct *tty);
-extern void tty_driver_flush_buffer(struct tty_struct *tty);
-extern void tty_unthrottle(struct tty_struct *tty);
-extern int tty_throttle_safe(struct tty_struct *tty);
-extern int tty_unthrottle_safe(struct tty_struct *tty);
-extern int tty_do_resize(struct tty_struct *tty, struct winsize *ws);
-extern int tty_get_icount(struct tty_struct *tty,
-			  struct serial_icounter_struct *icount);
-extern int is_current_pgrp_orphaned(void);
-extern void tty_hangup(struct tty_struct *tty);
-extern void tty_vhangup(struct tty_struct *tty);
-extern int tty_hung_up_p(struct file *filp);
-extern void do_SAK(struct tty_struct *tty);
-extern void __do_SAK(struct tty_struct *tty);
-extern void no_tty(void);
-extern speed_t tty_termios_baud_rate(struct ktermios *termios);
-extern void tty_termios_encode_baud_rate(struct ktermios *termios,
-						speed_t ibaud, speed_t obaud);
-extern void tty_encode_baud_rate(struct tty_struct *tty,
-						speed_t ibaud, speed_t obaud);
+const char *tty_driver_name(const struct tty_struct *tty);
+void tty_wait_until_sent(struct tty_struct *tty, long timeout);
+void stop_tty(struct tty_struct *tty);
+void start_tty(struct tty_struct *tty);
+void tty_write_message(struct tty_struct *tty, char *msg);
+int tty_send_xchar(struct tty_struct *tty, char ch);
+int tty_put_char(struct tty_struct *tty, unsigned char c);
+unsigned int tty_chars_in_buffer(struct tty_struct *tty);
+unsigned int tty_write_room(struct tty_struct *tty);
+void tty_driver_flush_buffer(struct tty_struct *tty);
+void tty_unthrottle(struct tty_struct *tty);
+int tty_throttle_safe(struct tty_struct *tty);
+int tty_unthrottle_safe(struct tty_struct *tty);
+int tty_do_resize(struct tty_struct *tty, struct winsize *ws);
+int tty_get_icount(struct tty_struct *tty,
+		struct serial_icounter_struct *icount);
+int is_current_pgrp_orphaned(void);
+void tty_hangup(struct tty_struct *tty);
+void tty_vhangup(struct tty_struct *tty);
+int tty_hung_up_p(struct file *filp);
+void do_SAK(struct tty_struct *tty);
+void __do_SAK(struct tty_struct *tty);
+void no_tty(void);
+speed_t tty_termios_baud_rate(struct ktermios *termios);
+void tty_termios_encode_baud_rate(struct ktermios *termios, speed_t ibaud,
+		speed_t obaud);
+void tty_encode_baud_rate(struct tty_struct *tty, speed_t ibaud,
+		speed_t obaud);
 
 /**
  *	tty_get_baud_rate	-	get tty bit rates
@@ -363,37 +363,37 @@ static inline speed_t tty_get_baud_rate(struct tty_struct *tty)
 unsigned char tty_get_char_size(unsigned int cflag);
 unsigned char tty_get_frame_size(unsigned int cflag);
 
-extern void tty_termios_copy_hw(struct ktermios *new, struct ktermios *old);
-extern int tty_termios_hw_change(const struct ktermios *a, const struct ktermios *b);
-extern int tty_set_termios(struct tty_struct *tty, struct ktermios *kt);
+void tty_termios_copy_hw(struct ktermios *new, struct ktermios *old);
+int tty_termios_hw_change(const struct ktermios *a, const struct ktermios *b);
+int tty_set_termios(struct tty_struct *tty, struct ktermios *kt);
 
-extern void tty_wakeup(struct tty_struct *tty);
+void tty_wakeup(struct tty_struct *tty);
 
-extern int tty_mode_ioctl(struct tty_struct *tty, struct file *file,
+int tty_mode_ioctl(struct tty_struct *tty, struct file *file,
 			unsigned int cmd, unsigned long arg);
-extern int tty_perform_flush(struct tty_struct *tty, unsigned long arg);
-extern struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx);
-extern void tty_release_struct(struct tty_struct *tty, int idx);
-extern void tty_init_termios(struct tty_struct *tty);
-extern void tty_save_termios(struct tty_struct *tty);
-extern int tty_standard_install(struct tty_driver *driver,
+int tty_perform_flush(struct tty_struct *tty, unsigned long arg);
+struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx);
+void tty_release_struct(struct tty_struct *tty, int idx);
+void tty_init_termios(struct tty_struct *tty);
+void tty_save_termios(struct tty_struct *tty);
+int tty_standard_install(struct tty_driver *driver,
 		struct tty_struct *tty);
 
 extern struct mutex tty_mutex;
 
 /* n_tty.c */
-extern void n_tty_inherit_ops(struct tty_ldisc_ops *ops);
+void n_tty_inherit_ops(struct tty_ldisc_ops *ops);
 #ifdef CONFIG_TTY
-extern void __init n_tty_init(void);
+void __init n_tty_init(void);
 #else
 static inline void n_tty_init(void) { }
 #endif
 
 /* tty_audit.c */
 #ifdef CONFIG_AUDIT
-extern void tty_audit_exit(void);
-extern void tty_audit_fork(struct signal_struct *sig);
-extern int tty_audit_push(void);
+void tty_audit_exit(void);
+void tty_audit_fork(struct signal_struct *sig);
+int tty_audit_push(void);
 #else
 static inline void tty_audit_exit(void)
 {
@@ -408,24 +408,23 @@ static inline int tty_audit_push(void)
 #endif
 
 /* tty_ioctl.c */
-extern int n_tty_ioctl_helper(struct tty_struct *tty, struct file *file,
-		       unsigned int cmd, unsigned long arg);
+int n_tty_ioctl_helper(struct tty_struct *tty, struct file *file,
+		unsigned int cmd, unsigned long arg);
 
 /* vt.c */
 
-extern int vt_ioctl(struct tty_struct *tty,
-		    unsigned int cmd, unsigned long arg);
+int vt_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg);
 
-extern long vt_compat_ioctl(struct tty_struct *tty,
-		     unsigned int cmd, unsigned long arg);
+long vt_compat_ioctl(struct tty_struct *tty, unsigned int cmd,
+		unsigned long arg);
 
 /* tty_mutex.c */
 /* functions for preparation of BKL removal */
-extern void tty_lock(struct tty_struct *tty);
-extern int  tty_lock_interruptible(struct tty_struct *tty);
-extern void tty_unlock(struct tty_struct *tty);
-extern void tty_lock_slave(struct tty_struct *tty);
-extern void tty_unlock_slave(struct tty_struct *tty);
-extern void tty_set_lock_subclass(struct tty_struct *tty);
+void tty_lock(struct tty_struct *tty);
+int  tty_lock_interruptible(struct tty_struct *tty);
+void tty_unlock(struct tty_struct *tty);
+void tty_lock_slave(struct tty_struct *tty);
+void tty_unlock_slave(struct tty_struct *tty);
+void tty_set_lock_subclass(struct tty_struct *tty);
 
 #endif
diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h
index c20431d8def8..29e1cf178afb 100644
--- a/include/linux/tty_driver.h
+++ b/include/linux/tty_driver.h
@@ -327,11 +327,11 @@ struct tty_driver {
 
 extern struct list_head tty_drivers;
 
-extern struct tty_driver *__tty_alloc_driver(unsigned int lines,
-		struct module *owner, unsigned long flags);
-extern struct tty_driver *tty_find_polling_driver(char *name, int *line);
+struct tty_driver *__tty_alloc_driver(unsigned int lines, struct module *owner,
+		unsigned long flags);
+struct tty_driver *tty_find_polling_driver(char *name, int *line);
 
-extern void tty_driver_kref_put(struct tty_driver *driver);
+void tty_driver_kref_put(struct tty_driver *driver);
 
 /* Use TTY_DRIVER_* flags below */
 #define tty_alloc_driver(lines, flags) \
diff --git a/include/linux/tty_flip.h b/include/linux/tty_flip.h
index 32284992b31a..9916acb5de49 100644
--- a/include/linux/tty_flip.h
+++ b/include/linux/tty_flip.h
@@ -7,16 +7,16 @@
 
 struct tty_ldisc;
 
-extern int tty_buffer_set_limit(struct tty_port *port, int limit);
-extern unsigned int tty_buffer_space_avail(struct tty_port *port);
-extern int tty_buffer_request_room(struct tty_port *port, size_t size);
-extern int tty_insert_flip_string_flags(struct tty_port *port,
+int tty_buffer_set_limit(struct tty_port *port, int limit);
+unsigned int tty_buffer_space_avail(struct tty_port *port);
+int tty_buffer_request_room(struct tty_port *port, size_t size);
+int tty_insert_flip_string_flags(struct tty_port *port,
 		const unsigned char *chars, const char *flags, size_t size);
-extern int tty_insert_flip_string_fixed_flag(struct tty_port *port,
+int tty_insert_flip_string_fixed_flag(struct tty_port *port,
 		const unsigned char *chars, char flag, size_t size);
-extern int tty_prepare_flip_string(struct tty_port *port,
-		unsigned char **chars, size_t size);
-extern void tty_flip_buffer_push(struct tty_port *port);
+int tty_prepare_flip_string(struct tty_port *port, unsigned char **chars,
+		size_t size);
+void tty_flip_buffer_push(struct tty_port *port);
 void tty_schedule_flip(struct tty_port *port);
 int __tty_insert_flip_char(struct tty_port *port, unsigned char ch, char flag);
 
@@ -45,7 +45,7 @@ static inline int tty_insert_flip_string(struct tty_port *port,
 int tty_ldisc_receive_buf(struct tty_ldisc *ld, const unsigned char *p,
 		const char *f, int count);
 
-extern void tty_buffer_lock_exclusive(struct tty_port *port);
-extern void tty_buffer_unlock_exclusive(struct tty_port *port);
+void tty_buffer_lock_exclusive(struct tty_port *port);
+void tty_buffer_unlock_exclusive(struct tty_port *port);
 
 #endif /* _LINUX_TTY_FLIP_H */
diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h
index 1f37184eee63..4d1c128afbfa 100644
--- a/include/linux/tty_ldisc.h
+++ b/include/linux/tty_ldisc.h
@@ -146,7 +146,7 @@ struct ld_semaphore {
 #endif
 };
 
-extern void __init_ldsem(struct ld_semaphore *sem, const char *name,
+void __init_ldsem(struct ld_semaphore *sem, const char *name,
 			 struct lock_class_key *key);
 
 #define init_ldsem(sem)						\
@@ -157,18 +157,18 @@ do {								\
 } while (0)
 
 
-extern int ldsem_down_read(struct ld_semaphore *sem, long timeout);
-extern int ldsem_down_read_trylock(struct ld_semaphore *sem);
-extern int ldsem_down_write(struct ld_semaphore *sem, long timeout);
-extern int ldsem_down_write_trylock(struct ld_semaphore *sem);
-extern void ldsem_up_read(struct ld_semaphore *sem);
-extern void ldsem_up_write(struct ld_semaphore *sem);
+int ldsem_down_read(struct ld_semaphore *sem, long timeout);
+int ldsem_down_read_trylock(struct ld_semaphore *sem);
+int ldsem_down_write(struct ld_semaphore *sem, long timeout);
+int ldsem_down_write_trylock(struct ld_semaphore *sem);
+void ldsem_up_read(struct ld_semaphore *sem);
+void ldsem_up_write(struct ld_semaphore *sem);
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
-extern int ldsem_down_read_nested(struct ld_semaphore *sem, int subclass,
-				  long timeout);
-extern int ldsem_down_write_nested(struct ld_semaphore *sem, int subclass,
-				   long timeout);
+int ldsem_down_read_nested(struct ld_semaphore *sem, int subclass,
+		long timeout);
+int ldsem_down_write_nested(struct ld_semaphore *sem, int subclass,
+		long timeout);
 #else
 # define ldsem_down_read_nested(sem, subclass, timeout)		\
 		ldsem_down_read(sem, timeout)
-- 
cgit v1.2.3


From 28f194da4a2c4d431552025a4386edaffab181bd Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Tue, 14 Sep 2021 11:11:22 +0200
Subject: tty: make tty_ldisc_ops::hangup return void

The documentation says that the return value of tty_ldisc_ops::hangup
hook is ignored. And it really is, so there is no point for its return
type to be int. Switch it to void and all the hooks too.

Cc: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Cc: Wolfgang Grandegger <wg@grandegger.com>
Cc: Marc Kleine-Budde <mkl@pengutronix.de>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Liam Girdwood <lgirdwood@gmail.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Jaroslav Kysela <perex@perex.cz>
Cc: Takashi Iwai <tiwai@suse.com>
Cc: Peter Ujfalusi <peter.ujfalusi@gmail.com>
Acked-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Acked-by: Mark Brown <broonie@kernel.org>
Acked-by: Marc Kleine-Budde <mkl@pengutronix.de>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20210914091134.17426-4-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/tty_ldisc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h
index 4d1c128afbfa..b85d84fb5f49 100644
--- a/include/linux/tty_ldisc.h
+++ b/include/linux/tty_ldisc.h
@@ -199,7 +199,7 @@ struct tty_ldisc_ops {
 	void	(*set_termios)(struct tty_struct *tty, struct ktermios *old);
 	__poll_t (*poll)(struct tty_struct *, struct file *,
 			     struct poll_table_struct *);
-	int	(*hangup)(struct tty_struct *tty);
+	void	(*hangup)(struct tty_struct *tty);
 
 	/*
 	 * The following routines are called from below.
-- 
cgit v1.2.3


From dcc223e8b9bf3f987bcd3c327a6737022b39e35b Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Tue, 14 Sep 2021 11:11:23 +0200
Subject: tty: remove file from tty_mode_ioctl

The only user of 'file' parameter in tty_mode_ioctl is a BUG_ON check.
Provided it never crashed for anyone, it's an overkill to pass the
parameter to tty_mode_ioctl only for this check.

If we wanted to check 'file' there, we should handle it in more graceful
way anyway. Not by a BUG == crash.

Cc: Wolfgang Grandegger <wg@grandegger.com>
Cc: Marc Kleine-Budde <mkl@pengutronix.de>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Andreas Koensgen <ajk@comnets.uni-bremen.de>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Marc Kleine-Budde <mkl@pengutronix.de>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20210914091134.17426-5-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/tty.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tty.h b/include/linux/tty.h
index 15453b0c4081..5e73f577c2b4 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -369,8 +369,7 @@ int tty_set_termios(struct tty_struct *tty, struct ktermios *kt);
 
 void tty_wakeup(struct tty_struct *tty);
 
-int tty_mode_ioctl(struct tty_struct *tty, struct file *file,
-			unsigned int cmd, unsigned long arg);
+int tty_mode_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg);
 int tty_perform_flush(struct tty_struct *tty, unsigned long arg);
 struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx);
 void tty_release_struct(struct tty_struct *tty, int idx);
-- 
cgit v1.2.3


From 7c783601a3bc22a54cce0fb650259a983c8cafba Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Tue, 14 Sep 2021 11:11:24 +0200
Subject: tty: remove file from n_tty_ioctl_helper

After the previous patch, there are no users of 'file' in
n_tty_ioctl_helper. So remove it also from there.

Cc: Marcel Holtmann <marcel@holtmann.org>
Cc: Johan Hedberg <johan.hedberg@gmail.com>
Cc: Luiz Augusto von Dentz <luiz.dentz@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20210914091134.17426-6-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/tty.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tty.h b/include/linux/tty.h
index 5e73f577c2b4..5dbd7c5afac7 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -407,8 +407,8 @@ static inline int tty_audit_push(void)
 #endif
 
 /* tty_ioctl.c */
-int n_tty_ioctl_helper(struct tty_struct *tty, struct file *file,
-		unsigned int cmd, unsigned long arg);
+int n_tty_ioctl_helper(struct tty_struct *tty, unsigned int cmd,
+		unsigned long arg);
 
 /* vt.c */
 
-- 
cgit v1.2.3


From 3229b906fb35b63515f0c703b917357c83e1ea22 Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Thu, 16 Sep 2021 20:15:57 +0200
Subject: lib: devres: Add managed arch_phys_wc_add()

Add devm_arch_phys_wc_add() as managed wrapper around arch_phys_wc_add().
Useful for several graphics drivers that set framebuffer memory to write
combining.

v2:
	* fix typo in commit description

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20210916181601.9146-2-tzimmermann@suse.de
---
 include/linux/io.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/io.h b/include/linux/io.h
index 9595151d800d..fcd8ea79c5df 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -132,6 +132,8 @@ static inline int arch_phys_wc_index(int handle)
 #endif
 #endif
 
+int devm_arch_phys_wc_add(struct device *dev, unsigned long base, unsigned long size);
+
 enum {
 	/* See memremap() kernel-doc for usage description... */
 	MEMREMAP_WB = 1 << 0,
-- 
cgit v1.2.3


From c822310725ee41af663de2448094155d442ff871 Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Thu, 16 Sep 2021 20:15:58 +0200
Subject: lib: devres: Add managed arch_io_reserve_memtype_wc()

Add devm_arch_io_reserve_memtype_wc() as managed wrapper around
arch_io_reserve_memtype_wc(). Useful for several graphics drivers
that set framebuffer memory to write combining.

v2:
	* fix typo in commit description

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20210916181601.9146-3-tzimmermann@suse.de
---
 include/linux/io.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/io.h b/include/linux/io.h
index fcd8ea79c5df..5fc800390fe4 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -168,4 +168,7 @@ static inline void arch_io_free_memtype_wc(resource_size_t base,
 }
 #endif
 
+int devm_arch_io_reserve_memtype_wc(struct device *dev, resource_size_t start,
+				    resource_size_t size);
+
 #endif /* _LINUX_IO_H */
-- 
cgit v1.2.3


From 23c64d7618a7e76e46db99444d184ef75498fb71 Mon Sep 17 00:00:00 2001
From: Piyush Mehta <piyush.mehta@xilinx.com>
Date: Wed, 22 Sep 2021 19:23:17 +0530
Subject: firmware: zynqmp: Add MMIO read and write support for PS_MODE pin

Add Xilinx ZynqMP firmware MMIO APIs support to set and get PS_MODE
pins value and status. These APIs create an interface path between
mode pin controller driver and low-level API to access GPIO pins.

Signed-off-by: Piyush Mehta <piyush.mehta@xilinx.com>
Acked-by: Michal Simek <michal.simek@xilinx.com>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Bartosz Golaszewski <brgl@bgdev.pl>
---
 include/linux/firmware/xlnx-zynqmp.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index 56b426fe020c..3917f89b2b3c 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -72,6 +72,8 @@ enum pm_api_id {
 	PM_SET_REQUIREMENT = 15,
 	PM_RESET_ASSERT = 17,
 	PM_RESET_GET_STATUS = 18,
+	PM_MMIO_WRITE = 19,
+	PM_MMIO_READ = 20,
 	PM_PM_INIT_FINALIZE = 21,
 	PM_FPGA_LOAD = 22,
 	PM_FPGA_GET_STATUS = 23,
@@ -390,6 +392,8 @@ int zynqmp_pm_sd_dll_reset(u32 node_id, u32 type);
 int zynqmp_pm_reset_assert(const enum zynqmp_pm_reset reset,
 			   const enum zynqmp_pm_reset_action assert_flag);
 int zynqmp_pm_reset_get_status(const enum zynqmp_pm_reset reset, u32 *status);
+unsigned int zynqmp_pm_bootmode_read(u32 *ps_mode);
+int zynqmp_pm_bootmode_write(u32 ps_mode);
 int zynqmp_pm_init_finalize(void);
 int zynqmp_pm_set_suspend_mode(u32 mode);
 int zynqmp_pm_request_node(const u32 node, const u32 capabilities,
@@ -520,6 +524,16 @@ static inline int zynqmp_pm_reset_get_status(const enum zynqmp_pm_reset reset,
 	return -ENODEV;
 }
 
+static inline unsigned int zynqmp_pm_bootmode_read(u32 *ps_mode)
+{
+	return -ENODEV;
+}
+
+static inline int zynqmp_pm_bootmode_write(u32 ps_mode)
+{
+	return -ENODEV;
+}
+
 static inline int zynqmp_pm_init_finalize(void)
 {
 	return -ENODEV;
-- 
cgit v1.2.3


From 68a81bb2eebdeaabb04ae04c4d89c3bb5bec3b32 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 22 Sep 2021 16:57:03 +0300
Subject: net: dsa: sja1105: remove sp->dp

It looks like this field was never used since its introduction in commit
227d07a07ef1 ("net: dsa: sja1105: Add support for traffic through
standalone ports") remove it.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dsa/sja1105.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h
index 171106202fe5..71b69ec52108 100644
--- a/include/linux/dsa/sja1105.h
+++ b/include/linux/dsa/sja1105.h
@@ -65,7 +65,6 @@ struct sja1105_port {
 	struct kthread_work xmit_work;
 	struct sk_buff_head xmit_queue;
 	struct sja1105_tagger_data *data;
-	struct dsa_port *dp;
 	bool hwts_tx_en;
 };
 
-- 
cgit v1.2.3


From 6d709cadfde68dbd12bef12fcced6222226dcb06 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 22 Sep 2021 17:37:25 +0300
Subject: net: dsa: move sja1110_process_meta_tstamp inside the tagging
 protocol driver

The problem is that DSA tagging protocols really must not depend on the
switch driver, because this creates a circular dependency at insmod
time, and the switch driver will effectively not load when the tagging
protocol driver is missing.

The code was structured in the way it was for a reason, though. The DSA
driver-facing API for PTP timestamping relies on the assumption that
two-step TX timestamps are provided by the hardware in an out-of-band
manner, typically by raising an interrupt and making that timestamp
available inside some sort of FIFO which is to be accessed over
SPI/MDIO/etc.

So the API puts .port_txtstamp into dsa_switch_ops, because it is
expected that the switch driver needs to save some state (like put the
skb into a queue until its TX timestamp arrives).

On SJA1110, TX timestamps are provided by the switch as Ethernet
packets, so this makes them be received and processed by the tagging
protocol driver. This in itself is great, because the timestamps are
full 64-bit and do not require reconstruction, and since Ethernet is the
fastest I/O method available to/from the switch, PTP timestamps arrive
very quickly, no matter how bottlenecked the SPI connection is, because
SPI interaction is not needed at all.

DSA's code structure and strict isolation between the tagging protocol
driver and the switch driver break the natural code organization.

When the tagging protocol driver receives a packet which is classified
as a metadata packet containing timestamps, it passes those timestamps
one by one to the switch driver, which then proceeds to compare them
based on the recorded timestamp ID that was generated in .port_txtstamp.

The communication between the tagging protocol and the switch driver is
done through a method exported by the switch driver, sja1110_process_meta_tstamp.
To satisfy build requirements, we force a dependency to build the
tagging protocol driver as a module when the switch driver is a module.
However, as explained in the first paragraph, that causes the circular
dependency.

To solve this, move the skb queue from struct sja1105_private :: struct
sja1105_ptp_data to struct sja1105_private :: struct sja1105_tagger_data.
The latter is a data structure for which hacks have already been put
into place to be able to create persistent storage per switch that is
accessible from the tagging protocol driver (see sja1105_setup_ports).

With the skb queue directly accessible from the tagging protocol driver,
we can now move sja1110_process_meta_tstamp into the tagging driver
itself, and avoid exporting a symbol.

Fixes: 566b18c8b752 ("net: dsa: sja1105: implement TX timestamping for SJA1110")
Link: https://lore.kernel.org/netdev/20210908220834.d7gmtnwrorhharna@skbuf/
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dsa/sja1105.h | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h
index 71b69ec52108..9d5a1053b276 100644
--- a/include/linux/dsa/sja1105.h
+++ b/include/linux/dsa/sja1105.h
@@ -48,6 +48,10 @@ struct sja1105_tagger_data {
 	spinlock_t meta_lock;
 	unsigned long state;
 	u8 ts_id;
+	/* Used on SJA1110 where meta frames are generated only for
+	 * 2-step TX timestamps
+	 */
+	struct sk_buff_head skb_txtstamp_queue;
 };
 
 struct sja1105_skb_cb {
@@ -68,25 +72,20 @@ struct sja1105_port {
 	bool hwts_tx_en;
 };
 
-enum sja1110_meta_tstamp {
-	SJA1110_META_TSTAMP_TX = 0,
-	SJA1110_META_TSTAMP_RX = 1,
-};
-
-#if IS_ENABLED(CONFIG_NET_DSA_SJA1105_PTP)
-
-void sja1110_process_meta_tstamp(struct dsa_switch *ds, int port, u8 ts_id,
-				 enum sja1110_meta_tstamp dir, u64 tstamp);
-
-#else
+/* Timestamps are in units of 8 ns clock ticks (equivalent to
+ * a fixed 125 MHz clock).
+ */
+#define SJA1105_TICK_NS			8
 
-static inline void sja1110_process_meta_tstamp(struct dsa_switch *ds, int port,
-					       u8 ts_id, enum sja1110_meta_tstamp dir,
-					       u64 tstamp)
+static inline s64 ns_to_sja1105_ticks(s64 ns)
 {
+	return ns / SJA1105_TICK_NS;
 }
 
-#endif /* IS_ENABLED(CONFIG_NET_DSA_SJA1105_PTP) */
+static inline s64 sja1105_ticks_to_ns(s64 ticks)
+{
+	return ticks * SJA1105_TICK_NS;
+}
 
 #if IS_ENABLED(CONFIG_NET_DSA_SJA1105)
 
-- 
cgit v1.2.3


From f5aef4241592765a868611509e5170bbe8c89ae7 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 22 Sep 2021 17:37:26 +0300
Subject: net: dsa: sja1105: break dependency between dsa_port_is_sja1105 and
 switch driver

It's nice to be able to test a tagging protocol with dsa_loop, but not
at the cost of losing the ability of building the tagging protocol and
switch driver as modules, because as things stand, there is a circular
dependency between the two. Tagging protocol drivers cannot depend on
switch drivers, that is a hard fact.

The reasoning behind the blamed patch was that accessing dp->priv should
first make sure that the structure behind that pointer is what we really
think it is.

Currently the "sja1105" and "sja1110" tagging protocols only operate
with the sja1105 switch driver, just like any other tagging protocol and
switch combination. The only way to mix and match them is by modifying
the code, and this applies to dsa_loop as well (by default that uses
DSA_TAG_PROTO_NONE). So while in principle there is an issue, in
practice there isn't one.

Until we extend dsa_loop to allow user space configuration, treat the
problem as a non-issue and just say that DSA ports found by tag_sja1105
are always sja1105 ports, which is in fact true. But keep the
dsa_port_is_sja1105 function so that it's easy to patch it during
testing, and rely on dead code elimination.

Fixes: 994d2cbb08ca ("net: dsa: tag_sja1105: be dsa_loop-safe")
Link: https://lore.kernel.org/netdev/20210908220834.d7gmtnwrorhharna@skbuf/
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dsa/sja1105.h | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h
index 9d5a1053b276..e6c78be40bde 100644
--- a/include/linux/dsa/sja1105.h
+++ b/include/linux/dsa/sja1105.h
@@ -87,22 +87,9 @@ static inline s64 sja1105_ticks_to_ns(s64 ticks)
 	return ticks * SJA1105_TICK_NS;
 }
 
-#if IS_ENABLED(CONFIG_NET_DSA_SJA1105)
-
-extern const struct dsa_switch_ops sja1105_switch_ops;
-
-static inline bool dsa_port_is_sja1105(struct dsa_port *dp)
-{
-	return dp->ds->ops == &sja1105_switch_ops;
-}
-
-#else
-
 static inline bool dsa_port_is_sja1105(struct dsa_port *dp)
 {
-	return false;
+	return true;
 }
 
-#endif
-
 #endif /* _NET_DSA_SJA1105_H */
-- 
cgit v1.2.3


From d50497c4a05e73e76874fd0d4942036375a7ec0f Mon Sep 17 00:00:00 2001
From: Prashant Malani <pmalani@chromium.org>
Date: Wed, 15 Sep 2021 18:46:27 -0700
Subject: platform/chrome: cros_ec_proto: Fix check_features ret val

The kerneldoc for cros_ec_check_features() states that it returns 1 or 0
depedending on whether a feature is supported or not, but it instead
returns a negative error number in one case, and a non-1 bitmask in
other cases.

Since all call-sites only check for a 1 or 0 return value, update
the function to return boolean values.

Signed-off-by: Prashant Malani <pmalani@chromium.org>
Reviewed-by: Guenter Roeck <groeck@chromium.org>
Signed-off-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Link: https://lore.kernel.org/r/20210916014632.2662612-1-pmalani@chromium.org
---
 include/linux/platform_data/cros_ec_proto.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h
index 02599687770c..55844ece0b32 100644
--- a/include/linux/platform_data/cros_ec_proto.h
+++ b/include/linux/platform_data/cros_ec_proto.h
@@ -227,7 +227,7 @@ int cros_ec_get_next_event(struct cros_ec_device *ec_dev,
 
 u32 cros_ec_get_host_event(struct cros_ec_device *ec_dev);
 
-int cros_ec_check_features(struct cros_ec_dev *ec, int feature);
+bool cros_ec_check_features(struct cros_ec_dev *ec, int feature);
 
 int cros_ec_get_sensor_count(struct cros_ec_dev *ec);
 
-- 
cgit v1.2.3


From 3fd445a4d49fce594eecc90b9bbcf85cd223154d Mon Sep 17 00:00:00 2001
From: Len Baker <len.baker@gmx.com>
Date: Sat, 4 Sep 2021 11:22:17 +0200
Subject: brcmfmac: Replace zero-length array with flexible array member

There is a regular need in the kernel to provide a way to declare
having a dynamically sized set of trailing elements in a structure.
Kernel code should always use "flexible array members"[1] for these
cases. The older style of one-element or zero-length arrays should
no longer be used[2].

Also, make use of the struct_size() helper in devm_kzalloc().

[1] https://en.wikipedia.org/wiki/Flexible_array_member
[2] https://www.kernel.org/doc/html/v5.14/process/deprecated.html#zero-length-and-one-element-arrays

Signed-off-by: Len Baker <len.baker@gmx.com>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
Link: https://lore.kernel.org/r/20210904092217.2848-1-len.baker@gmx.com
---
 include/linux/platform_data/brcmfmac.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/brcmfmac.h b/include/linux/platform_data/brcmfmac.h
index 1d30bf278231..2b5676ff35be 100644
--- a/include/linux/platform_data/brcmfmac.h
+++ b/include/linux/platform_data/brcmfmac.h
@@ -125,7 +125,7 @@ struct brcmfmac_pd_cc_entry {
  */
 struct brcmfmac_pd_cc {
 	int				table_size;
-	struct brcmfmac_pd_cc_entry	table[0];
+	struct brcmfmac_pd_cc_entry	table[];
 };
 
 /**
-- 
cgit v1.2.3


From ae98f40d32cd0ee6fc222e765734ffa497a0a95e Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Thu, 23 Sep 2021 13:57:32 -0700
Subject: net: phy: broadcom: Fix PHY_BRCM_IDDQ_SUSPEND definition

An extraneous number was added during the inclusion of that change,
correct that such that we use a single bit as is expected by the PHY
driver.

Reported-by: Justin Chen <justinpopo6@gmail.com>
Fixes: d6da08ed1425 ("net: phy: broadcom: Add IDDQ-SR mode")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/brcmphy.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index b119d6819d6c..27d9b6683f0e 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -67,7 +67,7 @@
 #define PHY_BRCM_CLEAR_RGMII_MODE	0x00000004
 #define PHY_BRCM_DIS_TXCRXC_NOENRGY	0x00000008
 #define PHY_BRCM_EN_MASTER_MODE		0x00000010
-#define PHY_BRCM_IDDQ_SUSPEND		0x000000220
+#define PHY_BRCM_IDDQ_SUSPEND		0x00000020
 
 /* Broadcom BCM7xxx specific workarounds */
 #define PHY_BRCM_7XXX_REV(x)		(((x) >> 8) & 0xff)
-- 
cgit v1.2.3


From e7f18c22e6bea258ffd65185fdab66d1e63dd5bd Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 19 Aug 2021 13:42:43 -0700
Subject: stddef: Fix kerndoc for sizeof_field() and offsetofend()

Adjust the comment styles so these are correctly identified as valid
kern-doc.

Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/stddef.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/stddef.h b/include/linux/stddef.h
index 998a4ba28eba..8553b33143d1 100644
--- a/include/linux/stddef.h
+++ b/include/linux/stddef.h
@@ -20,7 +20,7 @@ enum {
 #endif
 
 /**
- * sizeof_field(TYPE, MEMBER)
+ * sizeof_field() - Report the size of a struct field in bytes
  *
  * @TYPE: The structure containing the field of interest
  * @MEMBER: The field to return the size of
@@ -28,7 +28,7 @@ enum {
 #define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER))
 
 /**
- * offsetofend(TYPE, MEMBER)
+ * offsetofend() - Report the offset of a struct field within the struct
  *
  * @TYPE: The type of the structure
  * @MEMBER: The member within the structure to get the end offset of
-- 
cgit v1.2.3


From 50d7bd38c3aafc4749e05e8d7fcb616979143602 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 17 May 2021 20:01:15 -0700
Subject: stddef: Introduce struct_group() helper macro

Kernel code has a regular need to describe groups of members within a
structure usually when they need to be copied or initialized separately
from the rest of the surrounding structure. The generally accepted design
pattern in C is to use a named sub-struct:

	struct foo {
		int one;
		struct {
			int two;
			int three, four;
		} thing;
		int five;
	};

This would allow for traditional references and sizing:

	memcpy(&dst.thing, &src.thing, sizeof(dst.thing));

However, doing this would mean that referencing struct members enclosed
by such named structs would always require including the sub-struct name
in identifiers:

	do_something(dst.thing.three);

This has tended to be quite inflexible, especially when such groupings
need to be added to established code which causes huge naming churn.
Three workarounds exist in the kernel for this problem, and each have
other negative properties.

To avoid the naming churn, there is a design pattern of adding macro
aliases for the named struct:

	#define f_three thing.three

This ends up polluting the global namespace, and makes it difficult to
search for identifiers.

Another common work-around in kernel code avoids the pollution by avoiding
the named struct entirely, instead identifying the group's boundaries using
either a pair of empty anonymous structs of a pair of zero-element arrays:

	struct foo {
		int one;
		struct { } start;
		int two;
		int three, four;
		struct { } finish;
		int five;
	};

	struct foo {
		int one;
		int start[0];
		int two;
		int three, four;
		int finish[0];
		int five;
	};

This allows code to avoid needing to use a sub-struct named for member
references within the surrounding structure, but loses the benefits of
being able to actually use such a struct, making it rather fragile. Using
these requires open-coded calculation of sizes and offsets. The efforts
made to avoid common mistakes include lots of comments, or adding various
BUILD_BUG_ON()s. Such code is left with no way for the compiler to reason
about the boundaries (e.g. the "start" object looks like it's 0 bytes
in length), making bounds checking depend on open-coded calculations:

	if (length > offsetof(struct foo, finish) -
		     offsetof(struct foo, start))
		return -EINVAL;
	memcpy(&dst.start, &src.start, offsetof(struct foo, finish) -
				       offsetof(struct foo, start));

However, the vast majority of places in the kernel that operate on
groups of members do so without any identification of the grouping,
relying either on comments or implicit knowledge of the struct contents,
which is even harder for the compiler to reason about, and results in
even more fragile manual sizing, usually depending on member locations
outside of the region (e.g. to copy "two" and "three", use the start of
"four" to find the size):

	BUILD_BUG_ON((offsetof(struct foo, four) <
		      offsetof(struct foo, two)) ||
		     (offsetof(struct foo, four) <
		      offsetof(struct foo, three));
	if (length > offsetof(struct foo, four) -
		     offsetof(struct foo, two))
		return -EINVAL;
	memcpy(&dst.two, &src.two, length);

In order to have a regular programmatic way to describe a struct
region that can be used for references and sizing, can be examined for
bounds checking, avoids forcing the use of intermediate identifiers,
and avoids polluting the global namespace, introduce the struct_group()
macro. This macro wraps the member declarations to create an anonymous
union of an anonymous struct (no intermediate name) and a named struct
(for references and sizing):

	struct foo {
		int one;
		struct_group(thing,
			int two;
			int three, four;
		);
		int five;
	};

	if (length > sizeof(src.thing))
		return -EINVAL;
	memcpy(&dst.thing, &src.thing, length);
	do_something(dst.three);

There are some rare cases where the resulting struct_group() needs
attributes added, so struct_group_attr() is also introduced to allow
for specifying struct attributes (e.g. __align(x) or __packed).
Additionally, there are places where such declarations would like to
have the struct be tagged, so struct_group_tagged() is added.

Given there is a need for a handful of UAPI uses too, the underlying
__struct_group() macro has been defined in UAPI so it can be used there
too.

To avoid confusing scripts/kernel-doc, hide the macro from its struct
parsing.

Co-developed-by: Keith Packard <keithp@keithp.com>
Signed-off-by: Keith Packard <keithp@keithp.com>
Acked-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Link: https://lore.kernel.org/lkml/20210728023217.GC35706@embeddedor
Enhanced-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Link: https://lore.kernel.org/lkml/41183a98-bdb9-4ad6-7eab-5a7292a6df84@rasmusvillemoes.dk
Enhanced-by: Dan Williams <dan.j.williams@intel.com>
Link: https://lore.kernel.org/lkml/1d9a2e6df2a9a35b2cdd50a9a68cac5991e7e5f0.camel@intel.com
Enhanced-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://lore.kernel.org/lkml/YQKa76A6XuFqgM03@phenom.ffwll.local
Acked-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/stddef.h | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/stddef.h b/include/linux/stddef.h
index 8553b33143d1..8b103a53b000 100644
--- a/include/linux/stddef.h
+++ b/include/linux/stddef.h
@@ -36,4 +36,52 @@ enum {
 #define offsetofend(TYPE, MEMBER) \
 	(offsetof(TYPE, MEMBER)	+ sizeof_field(TYPE, MEMBER))
 
+/**
+ * struct_group() - Wrap a set of declarations in a mirrored struct
+ *
+ * @NAME: The identifier name of the mirrored sub-struct
+ * @MEMBERS: The member declarations for the mirrored structs
+ *
+ * Used to create an anonymous union of two structs with identical
+ * layout and size: one anonymous and one named. The former can be
+ * used normally without sub-struct naming, and the latter can be
+ * used to reason about the start, end, and size of the group of
+ * struct members.
+ */
+#define struct_group(NAME, MEMBERS...)	\
+	__struct_group(/* no tag */, NAME, /* no attrs */, MEMBERS)
+
+/**
+ * struct_group_attr() - Create a struct_group() with trailing attributes
+ *
+ * @NAME: The identifier name of the mirrored sub-struct
+ * @ATTRS: Any struct attributes to apply
+ * @MEMBERS: The member declarations for the mirrored structs
+ *
+ * Used to create an anonymous union of two structs with identical
+ * layout and size: one anonymous and one named. The former can be
+ * used normally without sub-struct naming, and the latter can be
+ * used to reason about the start, end, and size of the group of
+ * struct members. Includes structure attributes argument.
+ */
+#define struct_group_attr(NAME, ATTRS, MEMBERS...) \
+	__struct_group(/* no tag */, NAME, ATTRS, MEMBERS)
+
+/**
+ * struct_group_tagged() - Create a struct_group with a reusable tag
+ *
+ * @TAG: The tag name for the named sub-struct
+ * @NAME: The identifier name of the mirrored sub-struct
+ * @MEMBERS: The member declarations for the mirrored structs
+ *
+ * Used to create an anonymous union of two structs with identical
+ * layout and size: one anonymous and one named. The former can be
+ * used normally without sub-struct naming, and the latter can be
+ * used to reason about the start, end, and size of the group of
+ * struct members. Includes struct tag argument for the named copy,
+ * so the specified layout can be reused later.
+ */
+#define struct_group_tagged(TAG, NAME, MEMBERS...) \
+	__struct_group(TAG, NAME, /* no attrs */, MEMBERS)
+
 #endif
-- 
cgit v1.2.3


From c80d92fbb67b2c80b8eeb8759ee79d676eb33520 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 17 Jun 2021 22:48:05 -0700
Subject: compiler_types.h: Remove __compiletime_object_size()

Since all compilers support __builtin_object_size(), and there is only
one user of __compiletime_object_size, remove it to avoid the needless
indirection. This lets Clang reason about check_copy_size() correctly.

Link: https://github.com/ClangBuiltLinux/linux/issues/1179
Suggested-by: Nick Desaulniers <ndesaulniers@google.com>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Sedat Dilek <sedat.dilek@gmail.com>
Cc: Will Deacon <will@kernel.org>
Cc: Marco Elver <elver@google.com>
Cc: Arvind Sankar <nivedita@alum.mit.edu>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Luc Van Oostenryck <luc.vanoostenryck@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Sami Tolvanen <samitolvanen@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Gabriel Krisman Bertazi <krisman@collabora.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Miguel Ojeda <ojeda@kernel.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/compiler-gcc.h   | 2 --
 include/linux/compiler_types.h | 5 -----
 include/linux/thread_info.h    | 2 +-
 3 files changed, 1 insertion(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index bd2b881c6b63..9957085b8148 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -41,8 +41,6 @@
 
 #define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__)
 
-#define __compiletime_object_size(obj) __builtin_object_size(obj, 0)
-
 #if defined(LATENT_ENTROPY_PLUGIN) && !defined(__CHECKER__)
 #define __latent_entropy __attribute__((latent_entropy))
 #endif
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index b6ff83a714ca..05ceb2e92b0e 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -290,11 +290,6 @@ struct ftrace_likely_data {
 	(sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || \
 	 sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long))
 
-/* Compile time object size, -1 for unknown */
-#ifndef __compiletime_object_size
-# define __compiletime_object_size(obj) -1
-#endif
-
 #ifdef __OPTIMIZE__
 # define __compiletime_assert(condition, msg, prefix, suffix)		\
 	do {								\
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 0999f6317978..ad0c4e041030 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -203,7 +203,7 @@ static inline void copy_overflow(int size, unsigned long count)
 static __always_inline __must_check bool
 check_copy_size(const void *addr, size_t bytes, bool is_source)
 {
-	int sz = __compiletime_object_size(addr);
+	int sz = __builtin_object_size(addr, 0);
 	if (unlikely(sz >= 0 && sz < bytes)) {
 		if (!__builtin_constant_p(bytes))
 			copy_overflow(sz, bytes);
-- 
cgit v1.2.3


From c430f60036af44079170ff71a461b9d7cf5ee431 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 14 Apr 2021 15:45:39 -0700
Subject: fortify: Move remaining fortify helpers into fortify-string.h

When commit a28a6e860c6c ("string.h: move fortified functions definitions
in a dedicated header.") moved the fortify-specific code, some helpers
were left behind. Move the remaining fortify-specific helpers into
fortify-string.h so they're together where they're used. This requires
that any FORTIFY helper function prototypes be conditionally built to
avoid "no prototype" warnings. Additionally removes unused helpers.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Daniel Axtens <dja@axtens.net>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: Andrey Konovalov <andreyknvl@google.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Acked-by: Francis Laniel <laniel_francis@privacyrequired.com>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/fortify-string.h | 7 +++++++
 include/linux/string.h         | 9 ---------
 2 files changed, 7 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h
index c1be37437e77..7e67d02764db 100644
--- a/include/linux/fortify-string.h
+++ b/include/linux/fortify-string.h
@@ -2,6 +2,13 @@
 #ifndef _LINUX_FORTIFY_STRING_H_
 #define _LINUX_FORTIFY_STRING_H_
 
+#define __FORTIFY_INLINE extern __always_inline __attribute__((gnu_inline))
+#define __RENAME(x) __asm__(#x)
+
+void fortify_panic(const char *name) __noreturn __cold;
+void __read_overflow(void) __compiletime_error("detected read beyond size of object (1st parameter)");
+void __read_overflow2(void) __compiletime_error("detected read beyond size of object (2nd parameter)");
+void __write_overflow(void) __compiletime_error("detected write beyond size of object (1st parameter)");
 
 #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
 extern void *__underlying_memchr(const void *p, int c, __kernel_size_t size) __RENAME(memchr);
diff --git a/include/linux/string.h b/include/linux/string.h
index 5e96d656be7a..ac1c769a5a80 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -249,15 +249,6 @@ static inline const char *kbasename(const char *path)
 	return tail ? tail + 1 : path;
 }
 
-#define __FORTIFY_INLINE extern __always_inline __attribute__((gnu_inline))
-#define __RENAME(x) __asm__(#x)
-
-void fortify_panic(const char *name) __noreturn __cold;
-void __read_overflow(void) __compiletime_error("detected read beyond size of object passed as 1st parameter");
-void __read_overflow2(void) __compiletime_error("detected read beyond size of object passed as 2nd parameter");
-void __read_overflow3(void) __compiletime_error("detected read beyond size of object passed as 3rd parameter");
-void __write_overflow(void) __compiletime_error("detected write beyond size of object passed as 1st parameter");
-
 #if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE)
 #include <linux/fortify-string.h>
 #endif
-- 
cgit v1.2.3


From 072af0c638dc8a5c7db2edc4dddbd6d44bee3bdb Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 2 Aug 2021 10:25:01 -0700
Subject: fortify: Fix dropped strcpy() compile-time write overflow check

The implementation for intra-object overflow in str*-family functions
accidentally dropped compile-time write overflow checking in strcpy(),
leaving it entirely to run-time. Add back the intended check.

Fixes: 6a39e62abbaf ("lib: string.h: detect intra-object overflow in fortified string functions")
Cc: Daniel Axtens <dja@axtens.net>
Cc: Francis Laniel <laniel_francis@privacyrequired.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
---
 include/linux/fortify-string.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h
index 7e67d02764db..68bc5978d916 100644
--- a/include/linux/fortify-string.h
+++ b/include/linux/fortify-string.h
@@ -287,7 +287,10 @@ __FORTIFY_INLINE char *strcpy(char *p, const char *q)
 	if (p_size == (size_t)-1 && q_size == (size_t)-1)
 		return __underlying_strcpy(p, q);
 	size = strlen(q) + 1;
-	/* test here to use the more stringent object size */
+	/* Compile-time check for const size overflow. */
+	if (__builtin_constant_p(size) && p_size < size)
+		__write_overflow();
+	/* Run-time check for dynamic size overflow. */
 	if (p_size < size)
 		fortify_panic(__func__);
 	memcpy(p, q, size);
-- 
cgit v1.2.3


From 369cd2165d7beac1db144b40811baa2c6b7d8c54 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 4 Aug 2021 14:20:14 -0700
Subject: fortify: Prepare to improve strnlen() and strlen() warnings

In order to have strlen() use fortified strnlen() internally, swap their
positions in the source. Doing this as part of later changes makes
review difficult, so reoroder it here; no code changes.

Cc: Francis Laniel <laniel_francis@privacyrequired.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
---
 include/linux/fortify-string.h | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h
index 68bc5978d916..a3cb1d9aacce 100644
--- a/include/linux/fortify-string.h
+++ b/include/linux/fortify-string.h
@@ -56,6 +56,17 @@ __FORTIFY_INLINE char *strcat(char *p, const char *q)
 	return p;
 }
 
+extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen);
+__FORTIFY_INLINE __kernel_size_t strnlen(const char *p, __kernel_size_t maxlen)
+{
+	size_t p_size = __builtin_object_size(p, 1);
+	__kernel_size_t ret = __real_strnlen(p, maxlen < p_size ? maxlen : p_size);
+
+	if (p_size <= ret && maxlen != ret)
+		fortify_panic(__func__);
+	return ret;
+}
+
 __FORTIFY_INLINE __kernel_size_t strlen(const char *p)
 {
 	__kernel_size_t ret;
@@ -71,17 +82,6 @@ __FORTIFY_INLINE __kernel_size_t strlen(const char *p)
 	return ret;
 }
 
-extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen);
-__FORTIFY_INLINE __kernel_size_t strnlen(const char *p, __kernel_size_t maxlen)
-{
-	size_t p_size = __builtin_object_size(p, 1);
-	__kernel_size_t ret = __real_strnlen(p, maxlen < p_size ? maxlen : p_size);
-
-	if (p_size <= ret && maxlen != ret)
-		fortify_panic(__func__);
-	return ret;
-}
-
 /* defined after fortified strlen to reuse it */
 extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy);
 __FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size)
-- 
cgit v1.2.3


From 3009f891bb9f328945ebd5b71e12df7e2467f3dd Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 2 Aug 2021 22:51:31 -0700
Subject: fortify: Allow strlen() and strnlen() to pass compile-time known
 lengths

Under CONFIG_FORTIFY_SOURCE, it is possible for the compiler to perform
strlen() and strnlen() at compile-time when the string size is known.
This is required to support compile-time overflow checking in strlcpy().

Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/fortify-string.h | 49 ++++++++++++++++++++++++++++++++----------
 1 file changed, 38 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h
index a3cb1d9aacce..fdb0a74c9ca2 100644
--- a/include/linux/fortify-string.h
+++ b/include/linux/fortify-string.h
@@ -10,6 +10,20 @@ void __read_overflow(void) __compiletime_error("detected read beyond size of obj
 void __read_overflow2(void) __compiletime_error("detected read beyond size of object (2nd parameter)");
 void __write_overflow(void) __compiletime_error("detected write beyond size of object (1st parameter)");
 
+#define __compiletime_strlen(p)				\
+({							\
+	unsigned char *__p = (unsigned char *)(p);      \
+	size_t ret = (size_t)-1;			\
+	size_t p_size = __builtin_object_size(p, 1);	\
+	if (p_size != (size_t)-1) {			\
+		size_t p_len = p_size - 1;		\
+		if (__builtin_constant_p(__p[p_len]) &&	\
+		    __p[p_len] == '\0')			\
+			ret = __builtin_strlen(__p);	\
+	}						\
+	ret;						\
+})
+
 #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
 extern void *__underlying_memchr(const void *p, int c, __kernel_size_t size) __RENAME(memchr);
 extern int __underlying_memcmp(const void *p, const void *q, __kernel_size_t size) __RENAME(memcmp);
@@ -60,21 +74,31 @@ extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(st
 __FORTIFY_INLINE __kernel_size_t strnlen(const char *p, __kernel_size_t maxlen)
 {
 	size_t p_size = __builtin_object_size(p, 1);
-	__kernel_size_t ret = __real_strnlen(p, maxlen < p_size ? maxlen : p_size);
+	size_t p_len = __compiletime_strlen(p);
+	size_t ret;
+
+	/* We can take compile-time actions when maxlen is const. */
+	if (__builtin_constant_p(maxlen) && p_len != (size_t)-1) {
+		/* If p is const, we can use its compile-time-known len. */
+		if (maxlen >= p_size)
+			return p_len;
+	}
 
+	/* Do not check characters beyond the end of p. */
+	ret = __real_strnlen(p, maxlen < p_size ? maxlen : p_size);
 	if (p_size <= ret && maxlen != ret)
 		fortify_panic(__func__);
 	return ret;
 }
 
+/* defined after fortified strnlen to reuse it. */
 __FORTIFY_INLINE __kernel_size_t strlen(const char *p)
 {
 	__kernel_size_t ret;
 	size_t p_size = __builtin_object_size(p, 1);
 
-	/* Work around gcc excess stack consumption issue */
-	if (p_size == (size_t)-1 ||
-		(__builtin_constant_p(p[p_size - 1]) && p[p_size - 1] == '\0'))
+	/* Give up if we don't know how large p is. */
+	if (p_size == (size_t)-1)
 		return __underlying_strlen(p);
 	ret = strnlen(p, p_size);
 	if (p_size <= ret)
@@ -86,24 +110,27 @@ __FORTIFY_INLINE __kernel_size_t strlen(const char *p)
 extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy);
 __FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size)
 {
-	size_t ret;
 	size_t p_size = __builtin_object_size(p, 1);
 	size_t q_size = __builtin_object_size(q, 1);
+	size_t q_len;	/* Full count of source string length. */
+	size_t len;	/* Count of characters going into destination. */
 
 	if (p_size == (size_t)-1 && q_size == (size_t)-1)
 		return __real_strlcpy(p, q, size);
-	ret = strlen(q);
-	if (size) {
-		size_t len = (ret >= size) ? size - 1 : ret;
-
-		if (__builtin_constant_p(len) && len >= p_size)
+	q_len = strlen(q);
+	len = (q_len >= size) ? size - 1 : q_len;
+	if (__builtin_constant_p(size) && __builtin_constant_p(q_len) && size) {
+		/* Write size is always larger than destination. */
+		if (len >= p_size)
 			__write_overflow();
+	}
+	if (size) {
 		if (len >= p_size)
 			fortify_panic(__func__);
 		__underlying_memcpy(p, q, len);
 		p[len] = '\0';
 	}
-	return ret;
+	return q_len;
 }
 
 /* defined after fortified strnlen to reuse it */
-- 
cgit v1.2.3


From 405fca8a946168e71c04b82cc80727c3ea686e08 Mon Sep 17 00:00:00 2001
From: Wen Gong <wgong@codeaurora.org>
Date: Fri, 24 Sep 2021 06:00:47 -0400
Subject: ieee80211: add power type definition for 6 GHz

6 GHz regulatory domains introduces different modes for 6 GHz AP
operations: Low Power Indoor (LPI), Standard Power (SP) and Very
Low Power (VLP). 6 GHz STAs could be operated as either Regular or
Subordinate clients. Define the flags for power type
of AP and STATION mode.

Signed-off-by: Wen Gong <wgong@codeaurora.org>
Link: https://lore.kernel.org/r/20210924100052.32029-2-wgong@codeaurora.org
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 694264503119..420dea9aa648 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1988,6 +1988,44 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap,
 			      int mcs, bool ext_nss_bw_capable,
 			      unsigned int max_vht_nss);
 
+/**
+ * enum ieee80211_ap_reg_power - regulatory power for a Access Point
+ *
+ * @IEEE80211_REG_UNSET_AP: Access Point has no regulatory power mode
+ * @IEEE80211_REG_LPI: Indoor Access Point
+ * @IEEE80211_REG_SP: Standard power Access Point
+ * @IEEE80211_REG_VLP: Very low power Access Point
+ * @IEEE80211_REG_AP_POWER_AFTER_LAST: internal
+ * @IEEE80211_REG_AP_POWER_MAX: maximum value
+ */
+enum ieee80211_ap_reg_power {
+	IEEE80211_REG_UNSET_AP,
+	IEEE80211_REG_LPI_AP,
+	IEEE80211_REG_SP_AP,
+	IEEE80211_REG_VLP_AP,
+	IEEE80211_REG_AP_POWER_AFTER_LAST,
+	IEEE80211_REG_AP_POWER_MAX =
+		IEEE80211_REG_AP_POWER_AFTER_LAST - 1,
+};
+
+/**
+ * enum ieee80211_client_reg_power - regulatory power for a client
+ *
+ * @IEEE80211_REG_UNSET_CLIENT: Client has no regulatory power mode
+ * @IEEE80211_REG_DEFAULT_CLIENT: Default Client
+ * @IEEE80211_REG_SUBORDINATE_CLIENT: Subordinate Client
+ * @IEEE80211_REG_CLIENT_POWER_AFTER_LAST: internal
+ * @IEEE80211_REG_CLIENT_POWER_MAX: maximum value
+ */
+enum ieee80211_client_reg_power {
+	IEEE80211_REG_UNSET_CLIENT,
+	IEEE80211_REG_DEFAULT_CLIENT,
+	IEEE80211_REG_SUBORDINATE_CLIENT,
+	IEEE80211_REG_CLIENT_POWER_AFTER_LAST,
+	IEEE80211_REG_CLIENT_POWER_MAX =
+		IEEE80211_REG_CLIENT_POWER_AFTER_LAST - 1,
+};
+
 /* 802.11ax HE MAC capabilities */
 #define IEEE80211_HE_MAC_CAP0_HTC_HE				0x01
 #define IEEE80211_HE_MAC_CAP0_TWT_REQ				0x02
-- 
cgit v1.2.3


From 930dfa563155179861470b2aba880eac2ae30bfb Mon Sep 17 00:00:00 2001
From: Min Li <min.li.xe@renesas.com>
Date: Fri, 24 Sep 2021 15:01:32 -0400
Subject: ptp: clockmatrix: use rsmu driver to access i2c/spi bus

rsmu (Renesas Synchronization Management Unit ) driver is located in
drivers/mfd and responsible for creating multiple devices including
clockmatrix phc, which will then use the exposed regmap and mutex
handle to access i2c/spi bus.

Signed-off-by: Min Li <min.li.xe@renesas.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mfd/idt8a340_reg.h | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/idt8a340_reg.h b/include/linux/mfd/idt8a340_reg.h
index 92d763230bdf..a18c1539a152 100644
--- a/include/linux/mfd/idt8a340_reg.h
+++ b/include/linux/mfd/idt8a340_reg.h
@@ -506,6 +506,10 @@
 #define STATE_MODE_SHIFT                  (0)
 #define STATE_MODE_MASK                   (0x7)
 
+/* Bit definitions for the DPLL_MANU_REF_CFG register */
+#define MANUAL_REFERENCE_SHIFT            (0)
+#define MANUAL_REFERENCE_MASK             (0x1f)
+
 /* Bit definitions for the GPIO_CFG_GBL register */
 #define SUPPLY_MODE_SHIFT                 (0)
 #define SUPPLY_MODE_MASK                  (0x3)
@@ -654,7 +658,7 @@
 /* Values of DPLL_N.DPLL_MODE.PLL_MODE */
 enum pll_mode {
 	PLL_MODE_MIN = 0,
-	PLL_MODE_NORMAL = PLL_MODE_MIN,
+	PLL_MODE_PLL = PLL_MODE_MIN,
 	PLL_MODE_WRITE_PHASE = 1,
 	PLL_MODE_WRITE_FREQUENCY = 2,
 	PLL_MODE_GPIO_INC_DEC = 3,
@@ -664,6 +668,31 @@ enum pll_mode {
 	PLL_MODE_MAX = PLL_MODE_DISABLED,
 };
 
+/* Values of DPLL_CTRL_n.DPLL_MANU_REF_CFG.MANUAL_REFERENCE */
+enum manual_reference {
+	MANU_REF_MIN = 0,
+	MANU_REF_CLK0 = MANU_REF_MIN,
+	MANU_REF_CLK1,
+	MANU_REF_CLK2,
+	MANU_REF_CLK3,
+	MANU_REF_CLK4,
+	MANU_REF_CLK5,
+	MANU_REF_CLK6,
+	MANU_REF_CLK7,
+	MANU_REF_CLK8,
+	MANU_REF_CLK9,
+	MANU_REF_CLK10,
+	MANU_REF_CLK11,
+	MANU_REF_CLK12,
+	MANU_REF_CLK13,
+	MANU_REF_CLK14,
+	MANU_REF_CLK15,
+	MANU_REF_WRITE_PHASE,
+	MANU_REF_WRITE_FREQUENCY,
+	MANU_REF_XO_DPLL,
+	MANU_REF_MAX = MANU_REF_XO_DPLL,
+};
+
 enum hw_tod_write_trig_sel {
 	HW_TOD_WR_TRIG_SEL_MIN = 0,
 	HW_TOD_WR_TRIG_SEL_MSB = HW_TOD_WR_TRIG_SEL_MIN,
-- 
cgit v1.2.3


From f6bc909e7673c30abcbdb329e7d0aa2e83c103d7 Mon Sep 17 00:00:00 2001
From: Simon Trimmer <simont@opensource.cirrus.com>
Date: Mon, 13 Sep 2021 17:00:57 +0100
Subject: firmware: cs_dsp: add driver to support firmware loading on Cirrus
 Logic DSPs

wm_adsp originally provided firmware loading on some audio DSP and was
implemented as an ASoC codec driver. However, the firmware loading now
covers a wider range of DSP cores and peripherals containing them,
beyond just audio. So it needs to be available to non-audio drivers. All
the core firmware loading support has been moved into a new driver
cs_dsp, leaving only the ASoC-specific parts in wm_adsp.

Signed-off-by: Simon Trimmer <simont@opensource.cirrus.com>
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://lore.kernel.org/r/20210913160057.103842-17-simont@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/firmware/cirrus/cs_dsp.h | 242 +++++++++++++++++++++++++++++++++
 include/linux/firmware/cirrus/wmfw.h   | 202 +++++++++++++++++++++++++++
 2 files changed, 444 insertions(+)
 create mode 100644 include/linux/firmware/cirrus/cs_dsp.h
 create mode 100644 include/linux/firmware/cirrus/wmfw.h

(limited to 'include/linux')

diff --git a/include/linux/firmware/cirrus/cs_dsp.h b/include/linux/firmware/cirrus/cs_dsp.h
new file mode 100644
index 000000000000..9ad9eaaaa552
--- /dev/null
+++ b/include/linux/firmware/cirrus/cs_dsp.h
@@ -0,0 +1,242 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * cs_dsp.h  --  Cirrus Logic DSP firmware support
+ *
+ * Based on sound/soc/codecs/wm_adsp.h
+ *
+ * Copyright 2012 Wolfson Microelectronics plc
+ * Copyright (C) 2015-2021 Cirrus Logic, Inc. and
+ *                         Cirrus Logic International Semiconductor Ltd.
+ */
+#ifndef __CS_DSP_H
+#define __CS_DSP_H
+
+#define CS_ADSP2_REGION_0 BIT(0)
+#define CS_ADSP2_REGION_1 BIT(1)
+#define CS_ADSP2_REGION_2 BIT(2)
+#define CS_ADSP2_REGION_3 BIT(3)
+#define CS_ADSP2_REGION_4 BIT(4)
+#define CS_ADSP2_REGION_5 BIT(5)
+#define CS_ADSP2_REGION_6 BIT(6)
+#define CS_ADSP2_REGION_7 BIT(7)
+#define CS_ADSP2_REGION_8 BIT(8)
+#define CS_ADSP2_REGION_9 BIT(9)
+#define CS_ADSP2_REGION_1_9 (CS_ADSP2_REGION_1 | \
+		CS_ADSP2_REGION_2 | CS_ADSP2_REGION_3 | \
+		CS_ADSP2_REGION_4 | CS_ADSP2_REGION_5 | \
+		CS_ADSP2_REGION_6 | CS_ADSP2_REGION_7 | \
+		CS_ADSP2_REGION_8 | CS_ADSP2_REGION_9)
+#define CS_ADSP2_REGION_ALL (CS_ADSP2_REGION_0 | CS_ADSP2_REGION_1_9)
+
+#define CS_DSP_DATA_WORD_SIZE                3
+
+#define CS_DSP_ACKED_CTL_TIMEOUT_MS          100
+#define CS_DSP_ACKED_CTL_N_QUICKPOLLS        10
+#define CS_DSP_ACKED_CTL_MIN_VALUE           0
+#define CS_DSP_ACKED_CTL_MAX_VALUE           0xFFFFFF
+
+/**
+ * struct cs_dsp_region - Describes a logical memory region in DSP address space
+ * @type:	Memory region type
+ * @base:	Address of region
+ */
+struct cs_dsp_region {
+	int type;
+	unsigned int base;
+};
+
+/**
+ * struct cs_dsp_alg_region - Describes a logical algorithm region in DSP address space
+ * @list:	List node for internal use
+ * @alg:	Algorithm id
+ * @type:	Memory region type
+ * @base:	Address of region
+ */
+struct cs_dsp_alg_region {
+	struct list_head list;
+	unsigned int alg;
+	int type;
+	unsigned int base;
+};
+
+/**
+ * struct cs_dsp_coeff_ctl - Describes a coefficient control
+ * @fw_name:		Name of the firmware
+ * @subname:		Name of the control parsed from the WMFW
+ * @subname_len:	Length of subname
+ * @alg_region:		Logical region associated with this control
+ * @dsp:		DSP instance associated with this control
+ * @enabled:		Flag indicating whether control is enabled
+ * @list:		List node for internal use
+ * @cache:		Cached value of the control
+ * @offset:		Offset of control within alg_region
+ * @len:		Length of the cached value
+ * @set:		Flag indicating the value has been written by the user
+ * @flags:		Bitfield of WMFW_CTL_FLAG_ control flags defined in wmfw.h
+ * @type:		One of the WMFW_CTL_TYPE_ control types defined in wmfw.h
+ * @priv:		For use by the client
+ */
+struct cs_dsp_coeff_ctl {
+	const char *fw_name;
+	/* Subname is needed to match with firmware */
+	const char *subname;
+	unsigned int subname_len;
+	struct cs_dsp_alg_region alg_region;
+	struct cs_dsp *dsp;
+	unsigned int enabled:1;
+	struct list_head list;
+	void *cache;
+	unsigned int offset;
+	size_t len;
+	unsigned int set:1;
+	unsigned int flags;
+	unsigned int type;
+
+	void *priv;
+};
+
+struct cs_dsp_ops;
+struct cs_dsp_client_ops;
+
+/**
+ * struct cs_dsp - Configuration and state of a Cirrus Logic DSP
+ * @name:		The name of the DSP instance
+ * @rev:		Revision of the DSP
+ * @num:		DSP instance number
+ * @type:		Type of DSP
+ * @dev:		Driver model representation of the device
+ * @regmap:		Register map of the device
+ * @ops:		Function pointers for internal callbacks
+ * @client_ops:		Function pointers for client callbacks
+ * @base:		Address of the DSP registers
+ * @base_sysinfo:	Address of the sysinfo register (Halo only)
+ * @sysclk_reg:		Address of the sysclk register (ADSP1 only)
+ * @sysclk_mask:	Mask of frequency bits within sysclk register (ADSP1 only)
+ * @sysclk_shift:	Shift of frequency bits within sysclk register (ADSP1 only)
+ * @alg_regions:	List of currently loaded algorithm regions
+ * @fw_file_name:	Filename of the current firmware
+ * @fw_name:		Name of the current firmware
+ * @fw_id:		ID of the current firmware, obtained from the wmfw
+ * @fw_id_version:	Version of the firmware, obtained from the wmfw
+ * @fw_vendor_id:	Vendor of the firmware, obtained from the wmfw
+ * @mem:		DSP memory region descriptions
+ * @num_mems:		Number of memory regions in this DSP
+ * @fw_ver:		Version of the wmfw file format
+ * @booted:		Flag indicating DSP has been configured
+ * @running:		Flag indicating DSP is executing firmware
+ * @ctl_list:		Controls defined within the loaded DSP firmware
+ * @lock_regions:	Enable MPU traps on specified memory regions
+ * @pwr_lock:		Lock used to serialize accesses
+ * @debugfs_root:	Debugfs directory for this DSP instance
+ * @wmfw_file_name:	Filename of the currently loaded firmware
+ * @bin_file_name:	Filename of the currently loaded coefficients
+ */
+struct cs_dsp {
+	const char *name;
+	int rev;
+	int num;
+	int type;
+	struct device *dev;
+	struct regmap *regmap;
+
+	const struct cs_dsp_ops *ops;
+	const struct cs_dsp_client_ops *client_ops;
+
+	unsigned int base;
+	unsigned int base_sysinfo;
+	unsigned int sysclk_reg;
+	unsigned int sysclk_mask;
+	unsigned int sysclk_shift;
+
+	struct list_head alg_regions;
+
+	const char *fw_name;
+	unsigned int fw_id;
+	unsigned int fw_id_version;
+	unsigned int fw_vendor_id;
+
+	const struct cs_dsp_region *mem;
+	int num_mems;
+
+	int fw_ver;
+
+	bool booted;
+	bool running;
+
+	struct list_head ctl_list;
+
+	struct mutex pwr_lock;
+
+	unsigned int lock_regions;
+
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *debugfs_root;
+	char *wmfw_file_name;
+	char *bin_file_name;
+#endif
+};
+
+/**
+ * struct cs_dsp_client_ops - client callbacks
+ * @control_add:	Called under the pwr_lock when a control is created
+ * @control_remove:	Called under the pwr_lock when a control is destroyed
+ * @post_run:		Called under the pwr_lock by cs_dsp_run()
+ * @post_stop:		Called under the pwr_lock by cs_dsp_stop()
+ * @watchdog_expired:	Called when a watchdog expiry is detected
+ *
+ * These callbacks give the cs_dsp client an opportunity to respond to events
+ * or to perform actions atomically.
+ */
+struct cs_dsp_client_ops {
+	int (*control_add)(struct cs_dsp_coeff_ctl *ctl);
+	void (*control_remove)(struct cs_dsp_coeff_ctl *ctl);
+	int (*post_run)(struct cs_dsp *dsp);
+	void (*post_stop)(struct cs_dsp *dsp);
+	void (*watchdog_expired)(struct cs_dsp *dsp);
+};
+
+int cs_dsp_adsp1_init(struct cs_dsp *dsp);
+int cs_dsp_adsp2_init(struct cs_dsp *dsp);
+int cs_dsp_halo_init(struct cs_dsp *dsp);
+
+int cs_dsp_adsp1_power_up(struct cs_dsp *dsp,
+			  const struct firmware *wmfw_firmware, char *wmfw_filename,
+			  const struct firmware *coeff_firmware, char *coeff_filename,
+			  const char *fw_name);
+void cs_dsp_adsp1_power_down(struct cs_dsp *dsp);
+int cs_dsp_power_up(struct cs_dsp *dsp,
+		    const struct firmware *wmfw_firmware, char *wmfw_filename,
+		    const struct firmware *coeff_firmware, char *coeff_filename,
+		    const char *fw_name);
+void cs_dsp_power_down(struct cs_dsp *dsp);
+int cs_dsp_run(struct cs_dsp *dsp);
+void cs_dsp_stop(struct cs_dsp *dsp);
+
+void cs_dsp_remove(struct cs_dsp *dsp);
+
+int cs_dsp_set_dspclk(struct cs_dsp *dsp, unsigned int freq);
+void cs_dsp_adsp2_bus_error(struct cs_dsp *dsp);
+void cs_dsp_halo_bus_error(struct cs_dsp *dsp);
+void cs_dsp_halo_wdt_expire(struct cs_dsp *dsp);
+
+void cs_dsp_init_debugfs(struct cs_dsp *dsp, struct dentry *debugfs_root);
+void cs_dsp_cleanup_debugfs(struct cs_dsp *dsp);
+
+int cs_dsp_coeff_write_acked_control(struct cs_dsp_coeff_ctl *ctl, unsigned int event_id);
+int cs_dsp_coeff_write_ctrl(struct cs_dsp_coeff_ctl *ctl, const void *buf, size_t len);
+int cs_dsp_coeff_read_ctrl(struct cs_dsp_coeff_ctl *ctl, void *buf, size_t len);
+struct cs_dsp_coeff_ctl *cs_dsp_get_ctl(struct cs_dsp *dsp, const char *name, int type,
+					unsigned int alg);
+
+int cs_dsp_read_raw_data_block(struct cs_dsp *dsp, int mem_type, unsigned int mem_addr,
+			       unsigned int num_words, __be32 *data);
+int cs_dsp_read_data_word(struct cs_dsp *dsp, int mem_type, unsigned int mem_addr, u32 *data);
+int cs_dsp_write_data_word(struct cs_dsp *dsp, int mem_type, unsigned int mem_addr, u32 data);
+void cs_dsp_remove_padding(u32 *buf, int nwords);
+
+struct cs_dsp_alg_region *cs_dsp_find_alg_region(struct cs_dsp *dsp,
+						 int type, unsigned int id);
+
+const char *cs_dsp_mem_region_name(unsigned int type);
+
+#endif
diff --git a/include/linux/firmware/cirrus/wmfw.h b/include/linux/firmware/cirrus/wmfw.h
new file mode 100644
index 000000000000..a19bf7c6fc8b
--- /dev/null
+++ b/include/linux/firmware/cirrus/wmfw.h
@@ -0,0 +1,202 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * wmfw.h - Wolfson firmware format information
+ *
+ * Copyright 2012 Wolfson Microelectronics plc
+ *
+ * Author: Mark Brown <broonie@opensource.wolfsonmicro.com>
+ */
+
+#ifndef __WMFW_H
+#define __WMFW_H
+
+#include <linux/types.h>
+
+#define WMFW_MAX_ALG_NAME         256
+#define WMFW_MAX_ALG_DESCR_NAME   256
+
+#define WMFW_MAX_COEFF_NAME       256
+#define WMFW_MAX_COEFF_DESCR_NAME 256
+
+#define WMFW_CTL_FLAG_SYS         0x8000
+#define WMFW_CTL_FLAG_VOLATILE    0x0004
+#define WMFW_CTL_FLAG_WRITEABLE   0x0002
+#define WMFW_CTL_FLAG_READABLE    0x0001
+
+#define WMFW_CTL_TYPE_BYTES       0x0004 /* byte control */
+
+/* Non-ALSA coefficient types start at 0x1000 */
+#define WMFW_CTL_TYPE_ACKED       0x1000 /* acked control */
+#define WMFW_CTL_TYPE_HOSTEVENT   0x1001 /* event control */
+#define WMFW_CTL_TYPE_HOST_BUFFER 0x1002 /* host buffer pointer */
+
+struct wmfw_header {
+	char magic[4];
+	__le32 len;
+	__le16 rev;
+	u8 core;
+	u8 ver;
+} __packed;
+
+struct wmfw_footer {
+	__le64 timestamp;
+	__le32 checksum;
+} __packed;
+
+struct wmfw_adsp1_sizes {
+	__le32 dm;
+	__le32 pm;
+	__le32 zm;
+} __packed;
+
+struct wmfw_adsp2_sizes {
+	__le32 xm;
+	__le32 ym;
+	__le32 pm;
+	__le32 zm;
+} __packed;
+
+struct wmfw_region {
+	union {
+		__be32 type;
+		__le32 offset;
+	};
+	__le32 len;
+	u8 data[];
+} __packed;
+
+struct wmfw_id_hdr {
+	__be32 core_id;
+	__be32 core_rev;
+	__be32 id;
+	__be32 ver;
+} __packed;
+
+struct wmfw_v3_id_hdr {
+	__be32 core_id;
+	__be32 block_rev;
+	__be32 vendor_id;
+	__be32 id;
+	__be32 ver;
+} __packed;
+
+struct wmfw_adsp1_id_hdr {
+	struct wmfw_id_hdr fw;
+	__be32 zm;
+	__be32 dm;
+	__be32 n_algs;
+} __packed;
+
+struct wmfw_adsp2_id_hdr {
+	struct wmfw_id_hdr fw;
+	__be32 zm;
+	__be32 xm;
+	__be32 ym;
+	__be32 n_algs;
+} __packed;
+
+struct wmfw_halo_id_hdr {
+	struct wmfw_v3_id_hdr fw;
+	__be32 xm_base;
+	__be32 xm_size;
+	__be32 ym_base;
+	__be32 ym_size;
+	__be32 n_algs;
+} __packed;
+
+struct wmfw_alg_hdr {
+	__be32 id;
+	__be32 ver;
+} __packed;
+
+struct wmfw_adsp1_alg_hdr {
+	struct wmfw_alg_hdr alg;
+	__be32 zm;
+	__be32 dm;
+} __packed;
+
+struct wmfw_adsp2_alg_hdr {
+	struct wmfw_alg_hdr alg;
+	__be32 zm;
+	__be32 xm;
+	__be32 ym;
+} __packed;
+
+struct wmfw_halo_alg_hdr {
+	struct wmfw_alg_hdr alg;
+	__be32 xm_base;
+	__be32 xm_size;
+	__be32 ym_base;
+	__be32 ym_size;
+} __packed;
+
+struct wmfw_adsp_alg_data {
+	__le32 id;
+	u8 name[WMFW_MAX_ALG_NAME];
+	u8 descr[WMFW_MAX_ALG_DESCR_NAME];
+	__le32 ncoeff;
+	u8 data[];
+} __packed;
+
+struct wmfw_adsp_coeff_data {
+	struct {
+		__le16 offset;
+		__le16 type;
+		__le32 size;
+	} hdr;
+	u8 name[WMFW_MAX_COEFF_NAME];
+	u8 descr[WMFW_MAX_COEFF_DESCR_NAME];
+	__le16 ctl_type;
+	__le16 flags;
+	__le32 len;
+	u8 data[];
+} __packed;
+
+struct wmfw_coeff_hdr {
+	u8 magic[4];
+	__le32 len;
+	union {
+		__be32 rev;
+		__le32 ver;
+	};
+	union {
+		__be32 core;
+		__le32 core_ver;
+	};
+	u8 data[];
+} __packed;
+
+struct wmfw_coeff_item {
+	__le16 offset;
+	__le16 type;
+	__le32 id;
+	__le32 ver;
+	__le32 sr;
+	__le32 len;
+	u8 data[];
+} __packed;
+
+#define WMFW_ADSP1 1
+#define WMFW_ADSP2 2
+#define WMFW_HALO 4
+
+#define WMFW_ABSOLUTE         0xf0
+#define WMFW_ALGORITHM_DATA   0xf2
+#define WMFW_METADATA         0xfc
+#define WMFW_NAME_TEXT        0xfe
+#define WMFW_INFO_TEXT        0xff
+
+#define WMFW_ADSP1_PM 2
+#define WMFW_ADSP1_DM 3
+#define WMFW_ADSP1_ZM 4
+
+#define WMFW_ADSP2_PM 2
+#define WMFW_ADSP2_ZM 4
+#define WMFW_ADSP2_XM 5
+#define WMFW_ADSP2_YM 6
+
+#define WMFW_HALO_PM_PACKED 0x10
+#define WMFW_HALO_XM_PACKED 0x11
+#define WMFW_HALO_YM_PACKED 0x12
+
+#endif
-- 
cgit v1.2.3


From c25303281d79299e9f35d4b2e496a8bd134d5715 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Sat, 5 Jun 2021 23:20:17 -0400
Subject: mm: Convert get_page_unless_zero() to return bool

atomic_add_unless() returns bool, so remove the widening casts to int
in page_ref_add_unless() and get_page_unless_zero().  This causes gcc
to produce slightly larger code in isolate_migratepages_block(), but
it's not clear that it's worse code.  Net +19 bytes of text.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/mm.h       | 2 +-
 include/linux/page_ref.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 73a52aba448f..1de8864a1e28 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -754,7 +754,7 @@ static inline int put_page_testzero(struct page *page)
  * This can be called when MMU is off so it must not access
  * any of the virtual mappings.
  */
-static inline int get_page_unless_zero(struct page *page)
+static inline bool get_page_unless_zero(struct page *page)
 {
 	return page_ref_add_unless(page, 1, 0);
 }
diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h
index 7ad46f45df39..3a799de8ad52 100644
--- a/include/linux/page_ref.h
+++ b/include/linux/page_ref.h
@@ -161,9 +161,9 @@ static inline int page_ref_dec_return(struct page *page)
 	return ret;
 }
 
-static inline int page_ref_add_unless(struct page *page, int nr, int u)
+static inline bool page_ref_add_unless(struct page *page, int nr, int u)
 {
-	int ret = atomic_add_unless(&page->_refcount, nr, u);
+	bool ret = atomic_add_unless(&page->_refcount, nr, u);
 
 	if (page_ref_tracepoint_active(page_ref_mod_unless))
 		__page_ref_mod_unless(page, nr, ret);
-- 
cgit v1.2.3


From 7b230db3b8d373219f88a3d25c8fbbf12cc7f233 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Sun, 6 Dec 2020 22:22:48 -0500
Subject: mm: Introduce struct folio

A struct folio is a new abstraction to replace the venerable struct page.
A function which takes a struct folio argument declares that it will
operate on the entire (possibly compound) page, not just PAGE_SIZE bytes.
In return, the caller guarantees that the pointer it is passing does
not point to a tail page.  No change to generated code.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
---
 include/linux/mm.h         | 75 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mm_types.h   | 60 +++++++++++++++++++++++++++++++++++++
 include/linux/page-flags.h | 28 +++++++++++++++++
 3 files changed, 163 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1de8864a1e28..9057b8406acf 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -950,6 +950,20 @@ static inline unsigned int compound_order(struct page *page)
 	return page[1].compound_order;
 }
 
+/**
+ * folio_order - The allocation order of a folio.
+ * @folio: The folio.
+ *
+ * A folio is composed of 2^order pages.  See get_order() for the definition
+ * of order.
+ *
+ * Return: The order of the folio.
+ */
+static inline unsigned int folio_order(struct folio *folio)
+{
+	return compound_order(&folio->page);
+}
+
 static inline bool hpage_pincount_available(struct page *page)
 {
 	/*
@@ -1595,6 +1609,66 @@ static inline void set_page_links(struct page *page, enum zone_type zone,
 #endif
 }
 
+/**
+ * folio_nr_pages - The number of pages in the folio.
+ * @folio: The folio.
+ *
+ * Return: A positive power of two.
+ */
+static inline long folio_nr_pages(struct folio *folio)
+{
+	return compound_nr(&folio->page);
+}
+
+/**
+ * folio_next - Move to the next physical folio.
+ * @folio: The folio we're currently operating on.
+ *
+ * If you have physically contiguous memory which may span more than
+ * one folio (eg a &struct bio_vec), use this function to move from one
+ * folio to the next.  Do not use it if the memory is only virtually
+ * contiguous as the folios are almost certainly not adjacent to each
+ * other.  This is the folio equivalent to writing ``page++``.
+ *
+ * Context: We assume that the folios are refcounted and/or locked at a
+ * higher level and do not adjust the reference counts.
+ * Return: The next struct folio.
+ */
+static inline struct folio *folio_next(struct folio *folio)
+{
+	return (struct folio *)folio_page(folio, folio_nr_pages(folio));
+}
+
+/**
+ * folio_shift - The size of the memory described by this folio.
+ * @folio: The folio.
+ *
+ * A folio represents a number of bytes which is a power-of-two in size.
+ * This function tells you which power-of-two the folio is.  See also
+ * folio_size() and folio_order().
+ *
+ * Context: The caller should have a reference on the folio to prevent
+ * it from being split.  It is not necessary for the folio to be locked.
+ * Return: The base-2 logarithm of the size of this folio.
+ */
+static inline unsigned int folio_shift(struct folio *folio)
+{
+	return PAGE_SHIFT + folio_order(folio);
+}
+
+/**
+ * folio_size - The number of bytes in a folio.
+ * @folio: The folio.
+ *
+ * Context: The caller should have a reference on the folio to prevent
+ * it from being split.  It is not necessary for the folio to be locked.
+ * Return: The number of bytes in this folio.
+ */
+static inline size_t folio_size(struct folio *folio)
+{
+	return PAGE_SIZE << folio_order(folio);
+}
+
 /*
  * Some inline functions in vmstat.h depend on page_zone()
  */
@@ -1700,6 +1774,7 @@ extern void pagefault_out_of_memory(void);
 
 #define offset_in_page(p)	((unsigned long)(p) & ~PAGE_MASK)
 #define offset_in_thp(page, p)	((unsigned long)(p) & (thp_size(page) - 1))
+#define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1))
 
 /*
  * Flags passed to show_mem() and show_free_areas() to suppress output in
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7f8ee09c711f..6908186629b4 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -239,6 +239,66 @@ struct page {
 #endif
 } _struct_page_alignment;
 
+/**
+ * struct folio - Represents a contiguous set of bytes.
+ * @flags: Identical to the page flags.
+ * @lru: Least Recently Used list; tracks how recently this folio was used.
+ * @mapping: The file this page belongs to, or refers to the anon_vma for
+ *    anonymous memory.
+ * @index: Offset within the file, in units of pages.  For anonymous memory,
+ *    this is the index from the beginning of the mmap.
+ * @private: Filesystem per-folio data (see folio_attach_private()).
+ *    Used for swp_entry_t if folio_test_swapcache().
+ * @_mapcount: Do not access this member directly.  Use folio_mapcount() to
+ *    find out how many times this folio is mapped by userspace.
+ * @_refcount: Do not access this member directly.  Use folio_ref_count()
+ *    to find how many references there are to this folio.
+ * @memcg_data: Memory Control Group data.
+ *
+ * A folio is a physically, virtually and logically contiguous set
+ * of bytes.  It is a power-of-two in size, and it is aligned to that
+ * same power-of-two.  It is at least as large as %PAGE_SIZE.  If it is
+ * in the page cache, it is at a file offset which is a multiple of that
+ * power-of-two.  It may be mapped into userspace at an address which is
+ * at an arbitrary page offset, but its kernel virtual address is aligned
+ * to its size.
+ */
+struct folio {
+	/* private: don't document the anon union */
+	union {
+		struct {
+	/* public: */
+			unsigned long flags;
+			struct list_head lru;
+			struct address_space *mapping;
+			pgoff_t index;
+			void *private;
+			atomic_t _mapcount;
+			atomic_t _refcount;
+#ifdef CONFIG_MEMCG
+			unsigned long memcg_data;
+#endif
+	/* private: the union with struct page is transitional */
+		};
+		struct page page;
+	};
+};
+
+static_assert(sizeof(struct page) == sizeof(struct folio));
+#define FOLIO_MATCH(pg, fl)						\
+	static_assert(offsetof(struct page, pg) == offsetof(struct folio, fl))
+FOLIO_MATCH(flags, flags);
+FOLIO_MATCH(lru, lru);
+FOLIO_MATCH(compound_head, lru);
+FOLIO_MATCH(index, index);
+FOLIO_MATCH(private, private);
+FOLIO_MATCH(_mapcount, _mapcount);
+FOLIO_MATCH(_refcount, _refcount);
+#ifdef CONFIG_MEMCG
+FOLIO_MATCH(memcg_data, memcg_data);
+#endif
+#undef FOLIO_MATCH
+
 static inline atomic_t *compound_mapcount_ptr(struct page *page)
 {
 	return &page[1].compound_mapcount;
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index a558d67ee86f..e9b0723a637d 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -193,6 +193,34 @@ static inline unsigned long _compound_head(const struct page *page)
 
 #define compound_head(page)	((typeof(page))_compound_head(page))
 
+/**
+ * page_folio - Converts from page to folio.
+ * @p: The page.
+ *
+ * Every page is part of a folio.  This function cannot be called on a
+ * NULL pointer.
+ *
+ * Context: No reference, nor lock is required on @page.  If the caller
+ * does not hold a reference, this call may race with a folio split, so
+ * it should re-check the folio still contains this page after gaining
+ * a reference on the folio.
+ * Return: The folio which contains this page.
+ */
+#define page_folio(p)		(_Generic((p),				\
+	const struct page *:	(const struct folio *)_compound_head(p), \
+	struct page *:		(struct folio *)_compound_head(p)))
+
+/**
+ * folio_page - Return a page from a folio.
+ * @folio: The folio.
+ * @n: The page number to return.
+ *
+ * @n is relative to the start of the folio.  This function does not
+ * check that the page number lies within @folio; the caller is presumed
+ * to have a reference to the page.
+ */
+#define folio_page(folio, n)	nth_page(&(folio)->page, n)
+
 static __always_inline int PageTail(struct page *page)
 {
 	return READ_ONCE(page->compound_head) & 1;
-- 
cgit v1.2.3


From 32b8fc486524044f2643c5d3340fa436b761ba71 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 18 Jan 2021 07:40:36 -0500
Subject: mm: Add folio_pgdat(), folio_zone() and folio_zonenum()

These are just convenience wrappers for callers with folios; pgdat and
zone can be reached from tail pages as well as head pages.  No change
to generated code.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
---
 include/linux/mm.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9057b8406acf..3fc524f8c2a2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1145,6 +1145,11 @@ static inline enum zone_type page_zonenum(const struct page *page)
 	return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
 }
 
+static inline enum zone_type folio_zonenum(const struct folio *folio)
+{
+	return page_zonenum(&folio->page);
+}
+
 #ifdef CONFIG_ZONE_DEVICE
 static inline bool is_zone_device_page(const struct page *page)
 {
@@ -1560,6 +1565,16 @@ static inline pg_data_t *page_pgdat(const struct page *page)
 	return NODE_DATA(page_to_nid(page));
 }
 
+static inline struct zone *folio_zone(const struct folio *folio)
+{
+	return page_zone(&folio->page);
+}
+
+static inline pg_data_t *folio_pgdat(const struct folio *folio)
+{
+	return page_pgdat(&folio->page);
+}
+
 #ifdef SECTION_IN_PAGE_FLAGS
 static inline void set_page_section(struct page *page, unsigned long section)
 {
-- 
cgit v1.2.3


From a53e17e4e97b4519882d640984a386c522e9fba3 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 18 Jan 2021 08:14:00 -0500
Subject: mm/vmstat: Add functions to account folio statistics

Allow page counters to be more readily modified by callers which have
a folio.  Name these wrappers with 'stat' instead of 'state' as requested
by Linus here:
https://lore.kernel.org/linux-mm/CAHk-=wj847SudR-kt+46fT3+xFFgiwpgThvm7DJWGdi4cVrbnQ@mail.gmail.com/
No change to generated code.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
---
 include/linux/vmstat.h | 107 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 107 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index d6a6cf53b127..241bd0f53fb9 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -415,6 +415,78 @@ static inline void drain_zonestat(struct zone *zone,
 			struct per_cpu_zonestat *pzstats) { }
 #endif		/* CONFIG_SMP */
 
+static inline void __zone_stat_mod_folio(struct folio *folio,
+		enum zone_stat_item item, long nr)
+{
+	__mod_zone_page_state(folio_zone(folio), item, nr);
+}
+
+static inline void __zone_stat_add_folio(struct folio *folio,
+		enum zone_stat_item item)
+{
+	__mod_zone_page_state(folio_zone(folio), item, folio_nr_pages(folio));
+}
+
+static inline void __zone_stat_sub_folio(struct folio *folio,
+		enum zone_stat_item item)
+{
+	__mod_zone_page_state(folio_zone(folio), item, -folio_nr_pages(folio));
+}
+
+static inline void zone_stat_mod_folio(struct folio *folio,
+		enum zone_stat_item item, long nr)
+{
+	mod_zone_page_state(folio_zone(folio), item, nr);
+}
+
+static inline void zone_stat_add_folio(struct folio *folio,
+		enum zone_stat_item item)
+{
+	mod_zone_page_state(folio_zone(folio), item, folio_nr_pages(folio));
+}
+
+static inline void zone_stat_sub_folio(struct folio *folio,
+		enum zone_stat_item item)
+{
+	mod_zone_page_state(folio_zone(folio), item, -folio_nr_pages(folio));
+}
+
+static inline void __node_stat_mod_folio(struct folio *folio,
+		enum node_stat_item item, long nr)
+{
+	__mod_node_page_state(folio_pgdat(folio), item, nr);
+}
+
+static inline void __node_stat_add_folio(struct folio *folio,
+		enum node_stat_item item)
+{
+	__mod_node_page_state(folio_pgdat(folio), item, folio_nr_pages(folio));
+}
+
+static inline void __node_stat_sub_folio(struct folio *folio,
+		enum node_stat_item item)
+{
+	__mod_node_page_state(folio_pgdat(folio), item, -folio_nr_pages(folio));
+}
+
+static inline void node_stat_mod_folio(struct folio *folio,
+		enum node_stat_item item, long nr)
+{
+	mod_node_page_state(folio_pgdat(folio), item, nr);
+}
+
+static inline void node_stat_add_folio(struct folio *folio,
+		enum node_stat_item item)
+{
+	mod_node_page_state(folio_pgdat(folio), item, folio_nr_pages(folio));
+}
+
+static inline void node_stat_sub_folio(struct folio *folio,
+		enum node_stat_item item)
+{
+	mod_node_page_state(folio_pgdat(folio), item, -folio_nr_pages(folio));
+}
+
 static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages,
 					     int migratetype)
 {
@@ -543,6 +615,24 @@ static inline void __dec_lruvec_page_state(struct page *page,
 	__mod_lruvec_page_state(page, idx, -1);
 }
 
+static inline void __lruvec_stat_mod_folio(struct folio *folio,
+					   enum node_stat_item idx, int val)
+{
+	__mod_lruvec_page_state(&folio->page, idx, val);
+}
+
+static inline void __lruvec_stat_add_folio(struct folio *folio,
+					   enum node_stat_item idx)
+{
+	__lruvec_stat_mod_folio(folio, idx, folio_nr_pages(folio));
+}
+
+static inline void __lruvec_stat_sub_folio(struct folio *folio,
+					   enum node_stat_item idx)
+{
+	__lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio));
+}
+
 static inline void inc_lruvec_page_state(struct page *page,
 					 enum node_stat_item idx)
 {
@@ -555,4 +645,21 @@ static inline void dec_lruvec_page_state(struct page *page,
 	mod_lruvec_page_state(page, idx, -1);
 }
 
+static inline void lruvec_stat_mod_folio(struct folio *folio,
+					 enum node_stat_item idx, int val)
+{
+	mod_lruvec_page_state(&folio->page, idx, val);
+}
+
+static inline void lruvec_stat_add_folio(struct folio *folio,
+					 enum node_stat_item idx)
+{
+	lruvec_stat_mod_folio(folio, idx, folio_nr_pages(folio));
+}
+
+static inline void lruvec_stat_sub_folio(struct folio *folio,
+					 enum node_stat_item idx)
+{
+	lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio));
+}
 #endif /* _LINUX_VMSTAT_H */
-- 
cgit v1.2.3


From 9e9edb2094db7eb4c6da21e02ac885955350ecf9 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 15 Jan 2021 10:52:37 -0500
Subject: mm/debug: Add VM_BUG_ON_FOLIO() and VM_WARN_ON_ONCE_FOLIO()

These are the folio equivalents of VM_BUG_ON_PAGE and
VM_WARN_ON_ONCE_PAGE.  No change to generated code.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
---
 include/linux/mmdebug.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h
index 1935d4c72d10..d7285f8148a3 100644
--- a/include/linux/mmdebug.h
+++ b/include/linux/mmdebug.h
@@ -22,6 +22,13 @@ void dump_mm(const struct mm_struct *mm);
 			BUG();						\
 		}							\
 	} while (0)
+#define VM_BUG_ON_FOLIO(cond, folio)					\
+	do {								\
+		if (unlikely(cond)) {					\
+			dump_page(&folio->page, "VM_BUG_ON_FOLIO(" __stringify(cond)")");\
+			BUG();						\
+		}							\
+	} while (0)
 #define VM_BUG_ON_VMA(cond, vma)					\
 	do {								\
 		if (unlikely(cond)) {					\
@@ -47,6 +54,17 @@ void dump_mm(const struct mm_struct *mm);
 	}								\
 	unlikely(__ret_warn_once);					\
 })
+#define VM_WARN_ON_ONCE_FOLIO(cond, folio)	({			\
+	static bool __section(".data.once") __warned;			\
+	int __ret_warn_once = !!(cond);					\
+									\
+	if (unlikely(__ret_warn_once && !__warned)) {			\
+		dump_page(&folio->page, "VM_WARN_ON_ONCE_FOLIO(" __stringify(cond)")");\
+		__warned = true;					\
+		WARN_ON(1);						\
+	}								\
+	unlikely(__ret_warn_once);					\
+})
 
 #define VM_WARN_ON(cond) (void)WARN_ON(cond)
 #define VM_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond)
@@ -55,11 +73,13 @@ void dump_mm(const struct mm_struct *mm);
 #else
 #define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond)
 #define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond)
+#define VM_BUG_ON_FOLIO(cond, folio) VM_BUG_ON(cond)
 #define VM_BUG_ON_VMA(cond, vma) VM_BUG_ON(cond)
 #define VM_BUG_ON_MM(cond, mm) VM_BUG_ON(cond)
 #define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond)
 #define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond)
 #define VM_WARN_ON_ONCE_PAGE(cond, page)  BUILD_BUG_ON_INVALID(cond)
+#define VM_WARN_ON_ONCE_FOLIO(cond, folio)  BUILD_BUG_ON_INVALID(cond)
 #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond)
 #define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond)
 #endif
-- 
cgit v1.2.3


From c24016ac3a629655ea164b1129816660187943c0 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 31 Mar 2021 10:39:55 -0400
Subject: mm: Add folio reference count functions

These functions mirror their page reference counterparts.  Also add
the kernel-doc to the mm-api and correct the return type of
page_ref_add_unless() to bool.  No change to generated code.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
---
 include/linux/page_ref.h | 88 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 87 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h
index 3a799de8ad52..717d53c9ddf1 100644
--- a/include/linux/page_ref.h
+++ b/include/linux/page_ref.h
@@ -67,9 +67,31 @@ static inline int page_ref_count(const struct page *page)
 	return atomic_read(&page->_refcount);
 }
 
+/**
+ * folio_ref_count - The reference count on this folio.
+ * @folio: The folio.
+ *
+ * The refcount is usually incremented by calls to folio_get() and
+ * decremented by calls to folio_put().  Some typical users of the
+ * folio refcount:
+ *
+ * - Each reference from a page table
+ * - The page cache
+ * - Filesystem private data
+ * - The LRU list
+ * - Pipes
+ * - Direct IO which references this page in the process address space
+ *
+ * Return: The number of references to this folio.
+ */
+static inline int folio_ref_count(const struct folio *folio)
+{
+	return page_ref_count(&folio->page);
+}
+
 static inline int page_count(const struct page *page)
 {
-	return atomic_read(&compound_head(page)->_refcount);
+	return folio_ref_count(page_folio(page));
 }
 
 static inline void set_page_count(struct page *page, int v)
@@ -79,6 +101,11 @@ static inline void set_page_count(struct page *page, int v)
 		__page_ref_set(page, v);
 }
 
+static inline void folio_set_count(struct folio *folio, int v)
+{
+	set_page_count(&folio->page, v);
+}
+
 /*
  * Setup the page count before being freed into the page allocator for
  * the first time (boot or memory hotplug)
@@ -95,6 +122,11 @@ static inline void page_ref_add(struct page *page, int nr)
 		__page_ref_mod(page, nr);
 }
 
+static inline void folio_ref_add(struct folio *folio, int nr)
+{
+	page_ref_add(&folio->page, nr);
+}
+
 static inline void page_ref_sub(struct page *page, int nr)
 {
 	atomic_sub(nr, &page->_refcount);
@@ -102,6 +134,11 @@ static inline void page_ref_sub(struct page *page, int nr)
 		__page_ref_mod(page, -nr);
 }
 
+static inline void folio_ref_sub(struct folio *folio, int nr)
+{
+	page_ref_sub(&folio->page, nr);
+}
+
 static inline int page_ref_sub_return(struct page *page, int nr)
 {
 	int ret = atomic_sub_return(nr, &page->_refcount);
@@ -111,6 +148,11 @@ static inline int page_ref_sub_return(struct page *page, int nr)
 	return ret;
 }
 
+static inline int folio_ref_sub_return(struct folio *folio, int nr)
+{
+	return page_ref_sub_return(&folio->page, nr);
+}
+
 static inline void page_ref_inc(struct page *page)
 {
 	atomic_inc(&page->_refcount);
@@ -118,6 +160,11 @@ static inline void page_ref_inc(struct page *page)
 		__page_ref_mod(page, 1);
 }
 
+static inline void folio_ref_inc(struct folio *folio)
+{
+	page_ref_inc(&folio->page);
+}
+
 static inline void page_ref_dec(struct page *page)
 {
 	atomic_dec(&page->_refcount);
@@ -125,6 +172,11 @@ static inline void page_ref_dec(struct page *page)
 		__page_ref_mod(page, -1);
 }
 
+static inline void folio_ref_dec(struct folio *folio)
+{
+	page_ref_dec(&folio->page);
+}
+
 static inline int page_ref_sub_and_test(struct page *page, int nr)
 {
 	int ret = atomic_sub_and_test(nr, &page->_refcount);
@@ -134,6 +186,11 @@ static inline int page_ref_sub_and_test(struct page *page, int nr)
 	return ret;
 }
 
+static inline int folio_ref_sub_and_test(struct folio *folio, int nr)
+{
+	return page_ref_sub_and_test(&folio->page, nr);
+}
+
 static inline int page_ref_inc_return(struct page *page)
 {
 	int ret = atomic_inc_return(&page->_refcount);
@@ -143,6 +200,11 @@ static inline int page_ref_inc_return(struct page *page)
 	return ret;
 }
 
+static inline int folio_ref_inc_return(struct folio *folio)
+{
+	return page_ref_inc_return(&folio->page);
+}
+
 static inline int page_ref_dec_and_test(struct page *page)
 {
 	int ret = atomic_dec_and_test(&page->_refcount);
@@ -152,6 +214,11 @@ static inline int page_ref_dec_and_test(struct page *page)
 	return ret;
 }
 
+static inline int folio_ref_dec_and_test(struct folio *folio)
+{
+	return page_ref_dec_and_test(&folio->page);
+}
+
 static inline int page_ref_dec_return(struct page *page)
 {
 	int ret = atomic_dec_return(&page->_refcount);
@@ -161,6 +228,11 @@ static inline int page_ref_dec_return(struct page *page)
 	return ret;
 }
 
+static inline int folio_ref_dec_return(struct folio *folio)
+{
+	return page_ref_dec_return(&folio->page);
+}
+
 static inline bool page_ref_add_unless(struct page *page, int nr, int u)
 {
 	bool ret = atomic_add_unless(&page->_refcount, nr, u);
@@ -170,6 +242,11 @@ static inline bool page_ref_add_unless(struct page *page, int nr, int u)
 	return ret;
 }
 
+static inline bool folio_ref_add_unless(struct folio *folio, int nr, int u)
+{
+	return page_ref_add_unless(&folio->page, nr, u);
+}
+
 static inline int page_ref_freeze(struct page *page, int count)
 {
 	int ret = likely(atomic_cmpxchg(&page->_refcount, count, 0) == count);
@@ -179,6 +256,11 @@ static inline int page_ref_freeze(struct page *page, int count)
 	return ret;
 }
 
+static inline int folio_ref_freeze(struct folio *folio, int count)
+{
+	return page_ref_freeze(&folio->page, count);
+}
+
 static inline void page_ref_unfreeze(struct page *page, int count)
 {
 	VM_BUG_ON_PAGE(page_count(page) != 0, page);
@@ -189,4 +271,8 @@ static inline void page_ref_unfreeze(struct page *page, int count)
 		__page_ref_unfreeze(page, count);
 }
 
+static inline void folio_ref_unfreeze(struct folio *folio, int count)
+{
+	page_ref_unfreeze(&folio->page, count);
+}
 #endif
-- 
cgit v1.2.3


From b620f63358cd35c8e9084ee9fc460153f64714f6 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Sun, 6 Dec 2020 23:04:57 -0500
Subject: mm: Add folio_put()

If we know we have a folio, we can call folio_put() instead of put_page()
and save the overhead of calling compound_head().  Also skips the
devmap checks.

This commit looks like it should be a no-op, but actually saves 684 bytes
of text with the distro-derived config that I'm testing.  Some functions
grow a little while others shrink.  I presume the compiler is making
different inlining decisions.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
---
 include/linux/mm.h | 33 ++++++++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3fc524f8c2a2..28a84d82247b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -748,6 +748,11 @@ static inline int put_page_testzero(struct page *page)
 	return page_ref_dec_and_test(page);
 }
 
+static inline int folio_put_testzero(struct folio *folio)
+{
+	return put_page_testzero(&folio->page);
+}
+
 /*
  * Try to grab a ref unless the page has a refcount of zero, return false if
  * that is the case.
@@ -1247,9 +1252,28 @@ static inline __must_check bool try_get_page(struct page *page)
 	return true;
 }
 
+/**
+ * folio_put - Decrement the reference count on a folio.
+ * @folio: The folio.
+ *
+ * If the folio's reference count reaches zero, the memory will be
+ * released back to the page allocator and may be used by another
+ * allocation immediately.  Do not access the memory or the struct folio
+ * after calling folio_put() unless you can be sure that it wasn't the
+ * last reference.
+ *
+ * Context: May be called in process or interrupt context, but not in NMI
+ * context.  May be called while holding a spinlock.
+ */
+static inline void folio_put(struct folio *folio)
+{
+	if (folio_put_testzero(folio))
+		__put_page(&folio->page);
+}
+
 static inline void put_page(struct page *page)
 {
-	page = compound_head(page);
+	struct folio *folio = page_folio(page);
 
 	/*
 	 * For devmap managed pages we need to catch refcount transition from
@@ -1257,13 +1281,12 @@ static inline void put_page(struct page *page)
 	 * need to inform the device driver through callback. See
 	 * include/linux/memremap.h and HMM for details.
 	 */
-	if (page_is_devmap_managed(page)) {
-		put_devmap_managed_page(page);
+	if (page_is_devmap_managed(&folio->page)) {
+		put_devmap_managed_page(&folio->page);
 		return;
 	}
 
-	if (put_page_testzero(page))
-		__put_page(page);
+	folio_put(folio);
 }
 
 /*
-- 
cgit v1.2.3


From 86d234cb0499c6466ccfc45f6501bc0cd4621c60 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Sun, 6 Dec 2020 23:04:57 -0500
Subject: mm: Add folio_get()

If we know we have a folio, we can call folio_get() instead
of get_page() and save the overhead of calling compound_head().
No change to generated code.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
---
 include/linux/mm.h | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 28a84d82247b..63685d953ce0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1224,18 +1224,26 @@ static inline bool is_pci_p2pdma_page(const struct page *page)
 }
 
 /* 127: arbitrary random number, small enough to assemble well */
-#define page_ref_zero_or_close_to_overflow(page) \
-	((unsigned int) page_ref_count(page) + 127u <= 127u)
+#define folio_ref_zero_or_close_to_overflow(folio) \
+	((unsigned int) folio_ref_count(folio) + 127u <= 127u)
+
+/**
+ * folio_get - Increment the reference count on a folio.
+ * @folio: The folio.
+ *
+ * Context: May be called in any context, as long as you know that
+ * you have a refcount on the folio.  If you do not already have one,
+ * folio_try_get() may be the right interface for you to use.
+ */
+static inline void folio_get(struct folio *folio)
+{
+	VM_BUG_ON_FOLIO(folio_ref_zero_or_close_to_overflow(folio), folio);
+	folio_ref_inc(folio);
+}
 
 static inline void get_page(struct page *page)
 {
-	page = compound_head(page);
-	/*
-	 * Getting a normal page or the head of a compound page
-	 * requires to already have an elevated page->_refcount.
-	 */
-	VM_BUG_ON_PAGE(page_ref_zero_or_close_to_overflow(page), page);
-	page_ref_inc(page);
+	folio_get(page_folio(page));
 }
 
 bool __must_check try_grab_page(struct page *page, unsigned int flags);
-- 
cgit v1.2.3


From 020853b6f5ead2849b055e9873ff7267ce584256 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 10 May 2021 16:33:22 -0400
Subject: mm: Add folio_try_get_rcu()

This is the equivalent of page_cache_get_speculative().  Also add
folio_ref_try_add_rcu (the equivalent of page_cache_add_speculative)
and folio_get_unless_zero() (the equivalent of get_page_unless_zero()).

The new kernel-doc attempts to explain from the user's point of view
when to use folio_try_get_rcu() and when to use folio_get_unless_zero(),
because there seems to be some confusion currently between the users of
page_cache_get_speculative() and get_page_unless_zero().

Reimplement page_cache_add_speculative() and page_cache_get_speculative()
as wrappers around the folio equivalents, but leave get_page_unless_zero()
alone for now.  This commit reduces text size by 3 bytes due to slightly
different register allocation & instruction selections.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
---
 include/linux/page_ref.h | 66 +++++++++++++++++++++++++++++++++++++
 include/linux/pagemap.h  | 84 +++---------------------------------------------
 2 files changed, 70 insertions(+), 80 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h
index 717d53c9ddf1..2e677e6ad09f 100644
--- a/include/linux/page_ref.h
+++ b/include/linux/page_ref.h
@@ -247,6 +247,72 @@ static inline bool folio_ref_add_unless(struct folio *folio, int nr, int u)
 	return page_ref_add_unless(&folio->page, nr, u);
 }
 
+/**
+ * folio_try_get - Attempt to increase the refcount on a folio.
+ * @folio: The folio.
+ *
+ * If you do not already have a reference to a folio, you can attempt to
+ * get one using this function.  It may fail if, for example, the folio
+ * has been freed since you found a pointer to it, or it is frozen for
+ * the purposes of splitting or migration.
+ *
+ * Return: True if the reference count was successfully incremented.
+ */
+static inline bool folio_try_get(struct folio *folio)
+{
+	return folio_ref_add_unless(folio, 1, 0);
+}
+
+static inline bool folio_ref_try_add_rcu(struct folio *folio, int count)
+{
+#ifdef CONFIG_TINY_RCU
+	/*
+	 * The caller guarantees the folio will not be freed from interrupt
+	 * context, so (on !SMP) we only need preemption to be disabled
+	 * and TINY_RCU does that for us.
+	 */
+# ifdef CONFIG_PREEMPT_COUNT
+	VM_BUG_ON(!in_atomic() && !irqs_disabled());
+# endif
+	VM_BUG_ON_FOLIO(folio_ref_count(folio) == 0, folio);
+	folio_ref_add(folio, count);
+#else
+	if (unlikely(!folio_ref_add_unless(folio, count, 0))) {
+		/* Either the folio has been freed, or will be freed. */
+		return false;
+	}
+#endif
+	return true;
+}
+
+/**
+ * folio_try_get_rcu - Attempt to increase the refcount on a folio.
+ * @folio: The folio.
+ *
+ * This is a version of folio_try_get() optimised for non-SMP kernels.
+ * If you are still holding the rcu_read_lock() after looking up the
+ * page and know that the page cannot have its refcount decreased to
+ * zero in interrupt context, you can use this instead of folio_try_get().
+ *
+ * Example users include get_user_pages_fast() (as pages are not unmapped
+ * from interrupt context) and the page cache lookups (as pages are not
+ * truncated from interrupt context).  We also know that pages are not
+ * frozen in interrupt context for the purposes of splitting or migration.
+ *
+ * You can also use this function if you're holding a lock that prevents
+ * pages being frozen & removed; eg the i_pages lock for the page cache
+ * or the mmap_sem or page table lock for page tables.  In this case,
+ * it will always succeed, and you could have used a plain folio_get(),
+ * but it's sometimes more convenient to have a common function called
+ * from both locked and RCU-protected contexts.
+ *
+ * Return: True if the reference count was successfully incremented.
+ */
+static inline bool folio_try_get_rcu(struct folio *folio)
+{
+	return folio_ref_try_add_rcu(folio, 1);
+}
+
 static inline int page_ref_freeze(struct page *page, int count)
 {
 	int ret = likely(atomic_cmpxchg(&page->_refcount, count, 0) == count);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 62db6b0176b9..f3ec26551082 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -172,91 +172,15 @@ static inline struct address_space *page_mapping_file(struct page *page)
 	return page_mapping(page);
 }
 
-/*
- * speculatively take a reference to a page.
- * If the page is free (_refcount == 0), then _refcount is untouched, and 0
- * is returned. Otherwise, _refcount is incremented by 1 and 1 is returned.
- *
- * This function must be called inside the same rcu_read_lock() section as has
- * been used to lookup the page in the pagecache radix-tree (or page table):
- * this allows allocators to use a synchronize_rcu() to stabilize _refcount.
- *
- * Unless an RCU grace period has passed, the count of all pages coming out
- * of the allocator must be considered unstable. page_count may return higher
- * than expected, and put_page must be able to do the right thing when the
- * page has been finished with, no matter what it is subsequently allocated
- * for (because put_page is what is used here to drop an invalid speculative
- * reference).
- *
- * This is the interesting part of the lockless pagecache (and lockless
- * get_user_pages) locking protocol, where the lookup-side (eg. find_get_page)
- * has the following pattern:
- * 1. find page in radix tree
- * 2. conditionally increment refcount
- * 3. check the page is still in pagecache (if no, goto 1)
- *
- * Remove-side that cares about stability of _refcount (eg. reclaim) has the
- * following (with the i_pages lock held):
- * A. atomically check refcount is correct and set it to 0 (atomic_cmpxchg)
- * B. remove page from pagecache
- * C. free the page
- *
- * There are 2 critical interleavings that matter:
- * - 2 runs before A: in this case, A sees elevated refcount and bails out
- * - A runs before 2: in this case, 2 sees zero refcount and retries;
- *   subsequently, B will complete and 1 will find no page, causing the
- *   lookup to return NULL.
- *
- * It is possible that between 1 and 2, the page is removed then the exact same
- * page is inserted into the same position in pagecache. That's OK: the
- * old find_get_page using a lock could equally have run before or after
- * such a re-insertion, depending on order that locks are granted.
- *
- * Lookups racing against pagecache insertion isn't a big problem: either 1
- * will find the page or it will not. Likewise, the old find_get_page could run
- * either before the insertion or afterwards, depending on timing.
- */
-static inline int __page_cache_add_speculative(struct page *page, int count)
+static inline bool page_cache_add_speculative(struct page *page, int count)
 {
-#ifdef CONFIG_TINY_RCU
-# ifdef CONFIG_PREEMPT_COUNT
-	VM_BUG_ON(!in_atomic() && !irqs_disabled());
-# endif
-	/*
-	 * Preempt must be disabled here - we rely on rcu_read_lock doing
-	 * this for us.
-	 *
-	 * Pagecache won't be truncated from interrupt context, so if we have
-	 * found a page in the radix tree here, we have pinned its refcount by
-	 * disabling preempt, and hence no need for the "speculative get" that
-	 * SMP requires.
-	 */
-	VM_BUG_ON_PAGE(page_count(page) == 0, page);
-	page_ref_add(page, count);
-
-#else
-	if (unlikely(!page_ref_add_unless(page, count, 0))) {
-		/*
-		 * Either the page has been freed, or will be freed.
-		 * In either case, retry here and the caller should
-		 * do the right thing (see comments above).
-		 */
-		return 0;
-	}
-#endif
 	VM_BUG_ON_PAGE(PageTail(page), page);
-
-	return 1;
-}
-
-static inline int page_cache_get_speculative(struct page *page)
-{
-	return __page_cache_add_speculative(page, 1);
+	return folio_ref_try_add_rcu((struct folio *)page, count);
 }
 
-static inline int page_cache_add_speculative(struct page *page, int count)
+static inline bool page_cache_get_speculative(struct page *page)
 {
-	return __page_cache_add_speculative(page, count);
+	return page_cache_add_speculative(page, 1);
 }
 
 /**
-- 
cgit v1.2.3


From d389a4a8115518e2cc232649b012b72113fe8b67 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 7 Dec 2020 15:42:09 -0500
Subject: mm: Add folio flag manipulation functions

These new functions are the folio analogues of the various PageFlags
functions.  If CONFIG_DEBUG_VM_PGFLAGS is enabled, we check the folio
is not a tail page at every invocation.  This will also catch the
PagePoisoned case as a poisoned page has every bit set, which would
include PageTail.

This saves 1684 bytes of text with the distro-derived config that
I'm testing due to removing a double call to compound_head() in
PageSwapCache().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
---
 include/linux/page-flags.h | 219 ++++++++++++++++++++++++++++++++-------------
 1 file changed, 156 insertions(+), 63 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index e9b0723a637d..2491e84b8e52 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -143,6 +143,8 @@ enum pageflags {
 #endif
 	__NR_PAGEFLAGS,
 
+	PG_readahead = PG_reclaim,
+
 	/* Filesystems */
 	PG_checked = PG_owner_priv_1,
 
@@ -245,6 +247,15 @@ static inline void page_init_poison(struct page *page, size_t size)
 }
 #endif
 
+static unsigned long *folio_flags(struct folio *folio, unsigned n)
+{
+	struct page *page = &folio->page;
+
+	VM_BUG_ON_PGFLAGS(PageTail(page), page);
+	VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page);
+	return &page[n].flags;
+}
+
 /*
  * Page flags policies wrt compound pages
  *
@@ -289,36 +300,64 @@ static inline void page_init_poison(struct page *page, size_t size)
 		VM_BUG_ON_PGFLAGS(!PageHead(page), page);		\
 		PF_POISONED_CHECK(&page[1]); })
 
+/* Which page is the flag stored in */
+#define FOLIO_PF_ANY		0
+#define FOLIO_PF_HEAD		0
+#define FOLIO_PF_ONLY_HEAD	0
+#define FOLIO_PF_NO_TAIL	0
+#define FOLIO_PF_NO_COMPOUND	0
+#define FOLIO_PF_SECOND		1
+
 /*
  * Macros to create function definitions for page flags
  */
 #define TESTPAGEFLAG(uname, lname, policy)				\
+static __always_inline bool folio_test_##lname(struct folio *folio)	\
+{ return test_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); }	\
 static __always_inline int Page##uname(struct page *page)		\
-	{ return test_bit(PG_##lname, &policy(page, 0)->flags); }
+{ return test_bit(PG_##lname, &policy(page, 0)->flags); }
 
 #define SETPAGEFLAG(uname, lname, policy)				\
+static __always_inline							\
+void folio_set_##lname(struct folio *folio)				\
+{ set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); }		\
 static __always_inline void SetPage##uname(struct page *page)		\
-	{ set_bit(PG_##lname, &policy(page, 1)->flags); }
+{ set_bit(PG_##lname, &policy(page, 1)->flags); }
 
 #define CLEARPAGEFLAG(uname, lname, policy)				\
+static __always_inline							\
+void folio_clear_##lname(struct folio *folio)				\
+{ clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); }		\
 static __always_inline void ClearPage##uname(struct page *page)		\
-	{ clear_bit(PG_##lname, &policy(page, 1)->flags); }
+{ clear_bit(PG_##lname, &policy(page, 1)->flags); }
 
 #define __SETPAGEFLAG(uname, lname, policy)				\
+static __always_inline							\
+void __folio_set_##lname(struct folio *folio)				\
+{ __set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); }		\
 static __always_inline void __SetPage##uname(struct page *page)		\
-	{ __set_bit(PG_##lname, &policy(page, 1)->flags); }
+{ __set_bit(PG_##lname, &policy(page, 1)->flags); }
 
 #define __CLEARPAGEFLAG(uname, lname, policy)				\
+static __always_inline							\
+void __folio_clear_##lname(struct folio *folio)				\
+{ __clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); }	\
 static __always_inline void __ClearPage##uname(struct page *page)	\
-	{ __clear_bit(PG_##lname, &policy(page, 1)->flags); }
+{ __clear_bit(PG_##lname, &policy(page, 1)->flags); }
 
 #define TESTSETFLAG(uname, lname, policy)				\
+static __always_inline							\
+bool folio_test_set_##lname(struct folio *folio)			\
+{ return test_and_set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \
 static __always_inline int TestSetPage##uname(struct page *page)	\
-	{ return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); }
+{ return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); }
 
 #define TESTCLEARFLAG(uname, lname, policy)				\
+static __always_inline							\
+bool folio_test_clear_##lname(struct folio *folio)			\
+{ return test_and_clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \
 static __always_inline int TestClearPage##uname(struct page *page)	\
-	{ return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }
+{ return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }
 
 #define PAGEFLAG(uname, lname, policy)					\
 	TESTPAGEFLAG(uname, lname, policy)				\
@@ -334,29 +373,37 @@ static __always_inline int TestClearPage##uname(struct page *page)	\
 	TESTSETFLAG(uname, lname, policy)				\
 	TESTCLEARFLAG(uname, lname, policy)
 
-#define TESTPAGEFLAG_FALSE(uname)					\
+#define TESTPAGEFLAG_FALSE(uname, lname)				\
+static inline bool folio_test_##lname(const struct folio *folio) { return 0; } \
 static inline int Page##uname(const struct page *page) { return 0; }
 
-#define SETPAGEFLAG_NOOP(uname)						\
+#define SETPAGEFLAG_NOOP(uname, lname)					\
+static inline void folio_set_##lname(struct folio *folio) { }		\
 static inline void SetPage##uname(struct page *page) {  }
 
-#define CLEARPAGEFLAG_NOOP(uname)					\
+#define CLEARPAGEFLAG_NOOP(uname, lname)				\
+static inline void folio_clear_##lname(struct folio *folio) { }		\
 static inline void ClearPage##uname(struct page *page) {  }
 
-#define __CLEARPAGEFLAG_NOOP(uname)					\
+#define __CLEARPAGEFLAG_NOOP(uname, lname)				\
+static inline void __folio_clear_##lname(struct folio *folio) { }	\
 static inline void __ClearPage##uname(struct page *page) {  }
 
-#define TESTSETFLAG_FALSE(uname)					\
+#define TESTSETFLAG_FALSE(uname, lname)					\
+static inline bool folio_test_set_##lname(struct folio *folio)		\
+{ return 0; }								\
 static inline int TestSetPage##uname(struct page *page) { return 0; }
 
-#define TESTCLEARFLAG_FALSE(uname)					\
+#define TESTCLEARFLAG_FALSE(uname, lname)				\
+static inline bool folio_test_clear_##lname(struct folio *folio)	\
+{ return 0; }								\
 static inline int TestClearPage##uname(struct page *page) { return 0; }
 
-#define PAGEFLAG_FALSE(uname) TESTPAGEFLAG_FALSE(uname)			\
-	SETPAGEFLAG_NOOP(uname) CLEARPAGEFLAG_NOOP(uname)
+#define PAGEFLAG_FALSE(uname, lname) TESTPAGEFLAG_FALSE(uname, lname)	\
+	SETPAGEFLAG_NOOP(uname, lname) CLEARPAGEFLAG_NOOP(uname, lname)
 
-#define TESTSCFLAG_FALSE(uname)						\
-	TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname)
+#define TESTSCFLAG_FALSE(uname, lname)					\
+	TESTSETFLAG_FALSE(uname, lname) TESTCLEARFLAG_FALSE(uname, lname)
 
 __PAGEFLAG(Locked, locked, PF_NO_TAIL)
 PAGEFLAG(Waiters, waiters, PF_ONLY_HEAD) __CLEARPAGEFLAG(Waiters, waiters, PF_ONLY_HEAD)
@@ -412,8 +459,8 @@ PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL)
 /* PG_readahead is only used for reads; PG_reclaim is only for writes */
 PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL)
 	TESTCLEARFLAG(Reclaim, reclaim, PF_NO_TAIL)
-PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND)
-	TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND)
+PAGEFLAG(Readahead, readahead, PF_NO_COMPOUND)
+	TESTCLEARFLAG(Readahead, readahead, PF_NO_COMPOUND)
 
 #ifdef CONFIG_HIGHMEM
 /*
@@ -422,22 +469,25 @@ PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND)
  */
 #define PageHighMem(__p) is_highmem_idx(page_zonenum(__p))
 #else
-PAGEFLAG_FALSE(HighMem)
+PAGEFLAG_FALSE(HighMem, highmem)
 #endif
 
 #ifdef CONFIG_SWAP
-static __always_inline int PageSwapCache(struct page *page)
+static __always_inline bool folio_test_swapcache(struct folio *folio)
 {
-#ifdef CONFIG_THP_SWAP
-	page = compound_head(page);
-#endif
-	return PageSwapBacked(page) && test_bit(PG_swapcache, &page->flags);
+	return folio_test_swapbacked(folio) &&
+			test_bit(PG_swapcache, folio_flags(folio, 0));
+}
 
+static __always_inline bool PageSwapCache(struct page *page)
+{
+	return folio_test_swapcache(page_folio(page));
 }
+
 SETPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL)
 CLEARPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL)
 #else
-PAGEFLAG_FALSE(SwapCache)
+PAGEFLAG_FALSE(SwapCache, swapcache)
 #endif
 
 PAGEFLAG(Unevictable, unevictable, PF_HEAD)
@@ -449,14 +499,14 @@ PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
 	__CLEARPAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
 	TESTSCFLAG(Mlocked, mlocked, PF_NO_TAIL)
 #else
-PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked)
-	TESTSCFLAG_FALSE(Mlocked)
+PAGEFLAG_FALSE(Mlocked, mlocked) __CLEARPAGEFLAG_NOOP(Mlocked, mlocked)
+	TESTSCFLAG_FALSE(Mlocked, mlocked)
 #endif
 
 #ifdef CONFIG_ARCH_USES_PG_UNCACHED
 PAGEFLAG(Uncached, uncached, PF_NO_COMPOUND)
 #else
-PAGEFLAG_FALSE(Uncached)
+PAGEFLAG_FALSE(Uncached, uncached)
 #endif
 
 #ifdef CONFIG_MEMORY_FAILURE
@@ -465,7 +515,7 @@ TESTSCFLAG(HWPoison, hwpoison, PF_ANY)
 #define __PG_HWPOISON (1UL << PG_hwpoison)
 extern bool take_page_off_buddy(struct page *page);
 #else
-PAGEFLAG_FALSE(HWPoison)
+PAGEFLAG_FALSE(HWPoison, hwpoison)
 #define __PG_HWPOISON 0
 #endif
 
@@ -479,7 +529,7 @@ PAGEFLAG(Idle, idle, PF_ANY)
 #ifdef CONFIG_KASAN_HW_TAGS
 PAGEFLAG(SkipKASanPoison, skip_kasan_poison, PF_HEAD)
 #else
-PAGEFLAG_FALSE(SkipKASanPoison)
+PAGEFLAG_FALSE(SkipKASanPoison, skip_kasan_poison)
 #endif
 
 /*
@@ -517,10 +567,14 @@ static __always_inline int PageMappingFlags(struct page *page)
 	return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) != 0;
 }
 
-static __always_inline int PageAnon(struct page *page)
+static __always_inline bool folio_test_anon(struct folio *folio)
+{
+	return ((unsigned long)folio->mapping & PAGE_MAPPING_ANON) != 0;
+}
+
+static __always_inline bool PageAnon(struct page *page)
 {
-	page = compound_head(page);
-	return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
+	return folio_test_anon(page_folio(page));
 }
 
 static __always_inline int __PageMovable(struct page *page)
@@ -536,30 +590,32 @@ static __always_inline int __PageMovable(struct page *page)
  * is found in VM_MERGEABLE vmas.  It's a PageAnon page, pointing not to any
  * anon_vma, but to that page's node of the stable tree.
  */
-static __always_inline int PageKsm(struct page *page)
+static __always_inline bool folio_test_ksm(struct folio *folio)
 {
-	page = compound_head(page);
-	return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
+	return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) ==
 				PAGE_MAPPING_KSM;
 }
+
+static __always_inline bool PageKsm(struct page *page)
+{
+	return folio_test_ksm(page_folio(page));
+}
 #else
-TESTPAGEFLAG_FALSE(Ksm)
+TESTPAGEFLAG_FALSE(Ksm, ksm)
 #endif
 
 u64 stable_page_flags(struct page *page);
 
-static inline int PageUptodate(struct page *page)
+static inline bool folio_test_uptodate(struct folio *folio)
 {
-	int ret;
-	page = compound_head(page);
-	ret = test_bit(PG_uptodate, &(page)->flags);
+	bool ret = test_bit(PG_uptodate, folio_flags(folio, 0));
 	/*
-	 * Must ensure that the data we read out of the page is loaded
-	 * _after_ we've loaded page->flags to check for PageUptodate.
-	 * We can skip the barrier if the page is not uptodate, because
+	 * Must ensure that the data we read out of the folio is loaded
+	 * _after_ we've loaded folio->flags to check the uptodate bit.
+	 * We can skip the barrier if the folio is not uptodate, because
 	 * we wouldn't be reading anything from it.
 	 *
-	 * See SetPageUptodate() for the other side of the story.
+	 * See folio_mark_uptodate() for the other side of the story.
 	 */
 	if (ret)
 		smp_rmb();
@@ -567,23 +623,36 @@ static inline int PageUptodate(struct page *page)
 	return ret;
 }
 
-static __always_inline void __SetPageUptodate(struct page *page)
+static inline int PageUptodate(struct page *page)
+{
+	return folio_test_uptodate(page_folio(page));
+}
+
+static __always_inline void __folio_mark_uptodate(struct folio *folio)
 {
-	VM_BUG_ON_PAGE(PageTail(page), page);
 	smp_wmb();
-	__set_bit(PG_uptodate, &page->flags);
+	__set_bit(PG_uptodate, folio_flags(folio, 0));
 }
 
-static __always_inline void SetPageUptodate(struct page *page)
+static __always_inline void folio_mark_uptodate(struct folio *folio)
 {
-	VM_BUG_ON_PAGE(PageTail(page), page);
 	/*
 	 * Memory barrier must be issued before setting the PG_uptodate bit,
-	 * so that all previous stores issued in order to bring the page
-	 * uptodate are actually visible before PageUptodate becomes true.
+	 * so that all previous stores issued in order to bring the folio
+	 * uptodate are actually visible before folio_test_uptodate becomes true.
 	 */
 	smp_wmb();
-	set_bit(PG_uptodate, &page->flags);
+	set_bit(PG_uptodate, folio_flags(folio, 0));
+}
+
+static __always_inline void __SetPageUptodate(struct page *page)
+{
+	__folio_mark_uptodate((struct folio *)page);
+}
+
+static __always_inline void SetPageUptodate(struct page *page)
+{
+	folio_mark_uptodate((struct folio *)page);
 }
 
 CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL)
@@ -608,6 +677,17 @@ static inline void set_page_writeback_keepwrite(struct page *page)
 
 __PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY)
 
+/* Whether there are one or multiple pages in a folio */
+static inline bool folio_test_single(struct folio *folio)
+{
+	return !folio_test_head(folio);
+}
+
+static inline bool folio_test_multi(struct folio *folio)
+{
+	return folio_test_head(folio);
+}
+
 static __always_inline void set_compound_head(struct page *page, struct page *head)
 {
 	WRITE_ONCE(page->compound_head, (unsigned long)head + 1);
@@ -631,12 +711,15 @@ static inline void ClearPageCompound(struct page *page)
 #ifdef CONFIG_HUGETLB_PAGE
 int PageHuge(struct page *page);
 int PageHeadHuge(struct page *page);
+static inline bool folio_test_hugetlb(struct folio *folio)
+{
+	return PageHeadHuge(&folio->page);
+}
 #else
-TESTPAGEFLAG_FALSE(Huge)
-TESTPAGEFLAG_FALSE(HeadHuge)
+TESTPAGEFLAG_FALSE(Huge, hugetlb)
+TESTPAGEFLAG_FALSE(HeadHuge, headhuge)
 #endif
 
-
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 /*
  * PageHuge() only returns true for hugetlbfs pages, but not for
@@ -652,6 +735,11 @@ static inline int PageTransHuge(struct page *page)
 	return PageHead(page);
 }
 
+static inline bool folio_test_transhuge(struct folio *folio)
+{
+	return folio_test_head(folio);
+}
+
 /*
  * PageTransCompound returns true for both transparent huge pages
  * and hugetlbfs pages, so it should only be called when it's known
@@ -688,12 +776,12 @@ static inline int PageTransTail(struct page *page)
 PAGEFLAG(DoubleMap, double_map, PF_SECOND)
 	TESTSCFLAG(DoubleMap, double_map, PF_SECOND)
 #else
-TESTPAGEFLAG_FALSE(TransHuge)
-TESTPAGEFLAG_FALSE(TransCompound)
-TESTPAGEFLAG_FALSE(TransCompoundMap)
-TESTPAGEFLAG_FALSE(TransTail)
-PAGEFLAG_FALSE(DoubleMap)
-	TESTSCFLAG_FALSE(DoubleMap)
+TESTPAGEFLAG_FALSE(TransHuge, transhuge)
+TESTPAGEFLAG_FALSE(TransCompound, transcompound)
+TESTPAGEFLAG_FALSE(TransCompoundMap, transcompoundmap)
+TESTPAGEFLAG_FALSE(TransTail, transtail)
+PAGEFLAG_FALSE(DoubleMap, double_map)
+	TESTSCFLAG_FALSE(DoubleMap, double_map)
 #endif
 
 /*
@@ -877,6 +965,11 @@ static inline int page_has_private(struct page *page)
 	return !!(page->flags & PAGE_FLAGS_PRIVATE);
 }
 
+static inline bool folio_has_private(struct folio *folio)
+{
+	return page_has_private(&folio->page);
+}
+
 #undef PF_ANY
 #undef PF_HEAD
 #undef PF_ONLY_HEAD
-- 
cgit v1.2.3


From 889a3747b3b7661b089ba4eae081a3b6bb351a23 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 25 Feb 2021 09:47:41 -0500
Subject: mm/lru: Add folio LRU functions

Handle arbitrary-order folios being added to the LRU.  By definition,
all pages being added to the LRU were already head or base pages, but
call page_folio() on them anyway to get the type right and avoid the
buried calls to compound_head().

Saves 783 bytes of kernel text; no functions grow.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/mm_inline.h | 103 ++++++++++++++++++++++++++++++----------------
 1 file changed, 67 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 355ea1ee32bd..e2ec68b0515c 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -6,27 +6,33 @@
 #include <linux/swap.h>
 
 /**
- * page_is_file_lru - should the page be on a file LRU or anon LRU?
- * @page: the page to test
- *
- * Returns 1 if @page is a regular filesystem backed page cache page or a lazily
- * freed anonymous page (e.g. via MADV_FREE).  Returns 0 if @page is a normal
- * anonymous page, a tmpfs page or otherwise ram or swap backed page.  Used by
- * functions that manipulate the LRU lists, to sort a page onto the right LRU
- * list.
+ * folio_is_file_lru - Should the folio be on a file LRU or anon LRU?
+ * @folio: The folio to test.
  *
  * We would like to get this info without a page flag, but the state
- * needs to survive until the page is last deleted from the LRU, which
+ * needs to survive until the folio is last deleted from the LRU, which
  * could be as far down as __page_cache_release.
+ *
+ * Return: An integer (not a boolean!) used to sort a folio onto the
+ * right LRU list and to account folios correctly.
+ * 1 if @folio is a regular filesystem backed page cache folio
+ * or a lazily freed anonymous folio (e.g. via MADV_FREE).
+ * 0 if @folio is a normal anonymous folio, a tmpfs folio or otherwise
+ * ram or swap backed folio.
  */
+static inline int folio_is_file_lru(struct folio *folio)
+{
+	return !folio_test_swapbacked(folio);
+}
+
 static inline int page_is_file_lru(struct page *page)
 {
-	return !PageSwapBacked(page);
+	return folio_is_file_lru(page_folio(page));
 }
 
 static __always_inline void update_lru_size(struct lruvec *lruvec,
 				enum lru_list lru, enum zone_type zid,
-				int nr_pages)
+				long nr_pages)
 {
 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 
@@ -39,69 +45,94 @@ static __always_inline void update_lru_size(struct lruvec *lruvec,
 }
 
 /**
- * __clear_page_lru_flags - clear page lru flags before releasing a page
- * @page: the page that was on lru and now has a zero reference
+ * __folio_clear_lru_flags - Clear page lru flags before releasing a page.
+ * @folio: The folio that was on lru and now has a zero reference.
  */
-static __always_inline void __clear_page_lru_flags(struct page *page)
+static __always_inline void __folio_clear_lru_flags(struct folio *folio)
 {
-	VM_BUG_ON_PAGE(!PageLRU(page), page);
+	VM_BUG_ON_FOLIO(!folio_test_lru(folio), folio);
 
-	__ClearPageLRU(page);
+	__folio_clear_lru(folio);
 
 	/* this shouldn't happen, so leave the flags to bad_page() */
-	if (PageActive(page) && PageUnevictable(page))
+	if (folio_test_active(folio) && folio_test_unevictable(folio))
 		return;
 
-	__ClearPageActive(page);
-	__ClearPageUnevictable(page);
+	__folio_clear_active(folio);
+	__folio_clear_unevictable(folio);
+}
+
+static __always_inline void __clear_page_lru_flags(struct page *page)
+{
+	__folio_clear_lru_flags(page_folio(page));
 }
 
 /**
- * page_lru - which LRU list should a page be on?
- * @page: the page to test
+ * folio_lru_list - Which LRU list should a folio be on?
+ * @folio: The folio to test.
  *
- * Returns the LRU list a page should be on, as an index
+ * Return: The LRU list a folio should be on, as an index
  * into the array of LRU lists.
  */
-static __always_inline enum lru_list page_lru(struct page *page)
+static __always_inline enum lru_list folio_lru_list(struct folio *folio)
 {
 	enum lru_list lru;
 
-	VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
+	VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio);
 
-	if (PageUnevictable(page))
+	if (folio_test_unevictable(folio))
 		return LRU_UNEVICTABLE;
 
-	lru = page_is_file_lru(page) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON;
-	if (PageActive(page))
+	lru = folio_is_file_lru(folio) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON;
+	if (folio_test_active(folio))
 		lru += LRU_ACTIVE;
 
 	return lru;
 }
 
+static __always_inline
+void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio)
+{
+	enum lru_list lru = folio_lru_list(folio);
+
+	update_lru_size(lruvec, lru, folio_zonenum(folio),
+			folio_nr_pages(folio));
+	list_add(&folio->lru, &lruvec->lists[lru]);
+}
+
 static __always_inline void add_page_to_lru_list(struct page *page,
 				struct lruvec *lruvec)
 {
-	enum lru_list lru = page_lru(page);
+	lruvec_add_folio(lruvec, page_folio(page));
+}
 
-	update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
-	list_add(&page->lru, &lruvec->lists[lru]);
+static __always_inline
+void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio)
+{
+	enum lru_list lru = folio_lru_list(folio);
+
+	update_lru_size(lruvec, lru, folio_zonenum(folio),
+			folio_nr_pages(folio));
+	list_add_tail(&folio->lru, &lruvec->lists[lru]);
 }
 
 static __always_inline void add_page_to_lru_list_tail(struct page *page,
 				struct lruvec *lruvec)
 {
-	enum lru_list lru = page_lru(page);
+	lruvec_add_folio_tail(lruvec, page_folio(page));
+}
 
-	update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
-	list_add_tail(&page->lru, &lruvec->lists[lru]);
+static __always_inline
+void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio)
+{
+	list_del(&folio->lru);
+	update_lru_size(lruvec, folio_lru_list(folio), folio_zonenum(folio),
+			-folio_nr_pages(folio));
 }
 
 static __always_inline void del_page_from_lru_list(struct page *page,
 				struct lruvec *lruvec)
 {
-	list_del(&page->lru);
-	update_lru_size(lruvec, page_lru(page), page_zonenum(page),
-			-thp_nr_pages(page));
+	lruvec_del_folio(lruvec, page_folio(page));
 }
 #endif
-- 
cgit v1.2.3


From 85d0a2ed3747da7f9aedacb72478bbedf06f9f2e Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 11 Jan 2021 10:04:40 -0500
Subject: mm: Handle per-folio private data

Add folio_get_private() which mirrors page_private() -- ie folio private
data is the same as page private data.  The only difference is that these
return a void * instead of an unsigned long, which matches the majority
of users.

Turn attach_page_private() into folio_attach_private() and reimplement
attach_page_private() as a wrapper.  No filesystem which uses page private
data currently supports compound pages, so we're free to define the rules.
attach_page_private() may only be called on a head page; if you want
to add private data to a tail page, you can call set_page_private()
directly (and shouldn't increment the page refcount!  That should be
done when adding private data to the head page / folio).

This saves 813 bytes of text with the distro-derived config that I'm
testing due to removing the calls to compound_head() in get_page()
& put_page().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
---
 include/linux/mm_types.h | 11 +++++++++++
 include/linux/pagemap.h  | 48 +++++++++++++++++++++++++++++-------------------
 2 files changed, 40 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6908186629b4..5ebcb86ac934 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -317,6 +317,12 @@ static inline atomic_t *compound_pincount_ptr(struct page *page)
 #define PAGE_FRAG_CACHE_MAX_SIZE	__ALIGN_MASK(32768, ~PAGE_MASK)
 #define PAGE_FRAG_CACHE_MAX_ORDER	get_order(PAGE_FRAG_CACHE_MAX_SIZE)
 
+/*
+ * page_private can be used on tail pages.  However, PagePrivate is only
+ * checked by the VM on the head page.  So page_private on the tail pages
+ * should be used for data that's ancillary to the head page (eg attaching
+ * buffer heads to tail pages after attaching buffer heads to the head page)
+ */
 #define page_private(page)		((page)->private)
 
 static inline void set_page_private(struct page *page, unsigned long private)
@@ -324,6 +330,11 @@ static inline void set_page_private(struct page *page, unsigned long private)
 	page->private = private;
 }
 
+static inline void *folio_get_private(struct folio *folio)
+{
+	return folio->private;
+}
+
 struct page_frag_cache {
 	void * va;
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index f3ec26551082..b0f5bf1cb540 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -184,42 +184,52 @@ static inline bool page_cache_get_speculative(struct page *page)
 }
 
 /**
- * attach_page_private - Attach private data to a page.
- * @page: Page to attach data to.
- * @data: Data to attach to page.
+ * folio_attach_private - Attach private data to a folio.
+ * @folio: Folio to attach data to.
+ * @data: Data to attach to folio.
  *
- * Attaching private data to a page increments the page's reference count.
- * The data must be detached before the page will be freed.
+ * Attaching private data to a folio increments the page's reference count.
+ * The data must be detached before the folio will be freed.
  */
-static inline void attach_page_private(struct page *page, void *data)
+static inline void folio_attach_private(struct folio *folio, void *data)
 {
-	get_page(page);
-	set_page_private(page, (unsigned long)data);
-	SetPagePrivate(page);
+	folio_get(folio);
+	folio->private = data;
+	folio_set_private(folio);
 }
 
 /**
- * detach_page_private - Detach private data from a page.
- * @page: Page to detach data from.
+ * folio_detach_private - Detach private data from a folio.
+ * @folio: Folio to detach data from.
  *
- * Removes the data that was previously attached to the page and decrements
+ * Removes the data that was previously attached to the folio and decrements
  * the refcount on the page.
  *
- * Return: Data that was attached to the page.
+ * Return: Data that was attached to the folio.
  */
-static inline void *detach_page_private(struct page *page)
+static inline void *folio_detach_private(struct folio *folio)
 {
-	void *data = (void *)page_private(page);
+	void *data = folio_get_private(folio);
 
-	if (!PagePrivate(page))
+	if (!folio_test_private(folio))
 		return NULL;
-	ClearPagePrivate(page);
-	set_page_private(page, 0);
-	put_page(page);
+	folio_clear_private(folio);
+	folio->private = NULL;
+	folio_put(folio);
 
 	return data;
 }
 
+static inline void attach_page_private(struct page *page, void *data)
+{
+	folio_attach_private(page_folio(page), data);
+}
+
+static inline void *detach_page_private(struct page *page)
+{
+	return folio_detach_private(page_folio(page));
+}
+
 #ifdef CONFIG_NUMA
 extern struct page *__page_cache_alloc(gfp_t gfp);
 #else
-- 
cgit v1.2.3


From 9257e15677384d1ccdfe0619c4455e34cb0342c7 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 15 Jan 2021 23:39:21 -0500
Subject: mm/filemap: Add folio_index(), folio_file_page() and folio_contains()

folio_index() is the equivalent of page_index() for folios.
folio_file_page() is the equivalent of find_subpage().
folio_contains() is the equivalent of thp_contains().

No changes to generated code.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
---
 include/linux/pagemap.h | 56 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index b0f5bf1cb540..e7b46223239a 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -386,6 +386,62 @@ static inline bool thp_contains(struct page *head, pgoff_t index)
 	return page_index(head) == (index & ~(thp_nr_pages(head) - 1UL));
 }
 
+#define swapcache_index(folio)	__page_file_index(&(folio)->page)
+
+/**
+ * folio_index - File index of a folio.
+ * @folio: The folio.
+ *
+ * For a folio which is either in the page cache or the swap cache,
+ * return its index within the address_space it belongs to.  If you know
+ * the page is definitely in the page cache, you can look at the folio's
+ * index directly.
+ *
+ * Return: The index (offset in units of pages) of a folio in its file.
+ */
+static inline pgoff_t folio_index(struct folio *folio)
+{
+        if (unlikely(folio_test_swapcache(folio)))
+                return swapcache_index(folio);
+        return folio->index;
+}
+
+/**
+ * folio_file_page - The page for a particular index.
+ * @folio: The folio which contains this index.
+ * @index: The index we want to look up.
+ *
+ * Sometimes after looking up a folio in the page cache, we need to
+ * obtain the specific page for an index (eg a page fault).
+ *
+ * Return: The page containing the file data for this index.
+ */
+static inline struct page *folio_file_page(struct folio *folio, pgoff_t index)
+{
+	/* HugeTLBfs indexes the page cache in units of hpage_size */
+	if (folio_test_hugetlb(folio))
+		return &folio->page;
+	return folio_page(folio, index & (folio_nr_pages(folio) - 1));
+}
+
+/**
+ * folio_contains - Does this folio contain this index?
+ * @folio: The folio.
+ * @index: The page index within the file.
+ *
+ * Context: The caller should have the page locked in order to prevent
+ * (eg) shmem from moving the page between the page cache and swap cache
+ * and changing its index in the middle of the operation.
+ * Return: true or false.
+ */
+static inline bool folio_contains(struct folio *folio, pgoff_t index)
+{
+	/* HugeTLBfs indexes the page cache in units of hpage_size */
+	if (folio_test_hugetlb(folio))
+		return folio->index == index;
+	return index - folio_index(folio) < folio_nr_pages(folio);
+}
+
 /*
  * Given the page we found in the page cache, return the page corresponding
  * to this index in the file
-- 
cgit v1.2.3


From f94b18f6653ab6d3aa503213c2e1e79b18f05a30 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Sun, 21 Mar 2021 16:24:31 -0400
Subject: mm/filemap: Add folio_next_index()

This helper returns the page index of the next folio in the file (ie
the end of this folio, plus one).

No changes to generated code.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
---
 include/linux/pagemap.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index e7b46223239a..95e83a9e73c7 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -406,6 +406,17 @@ static inline pgoff_t folio_index(struct folio *folio)
         return folio->index;
 }
 
+/**
+ * folio_next_index - Get the index of the next folio.
+ * @folio: The current folio.
+ *
+ * Return: The index of the folio which follows this folio in the file.
+ */
+static inline pgoff_t folio_next_index(struct folio *folio)
+{
+	return folio->index + folio_nr_pages(folio);
+}
+
 /**
  * folio_file_page - The page for a particular index.
  * @folio: The folio which contains this index.
-- 
cgit v1.2.3


From 352b47a6984470954e1e262126b1da5312c40b09 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 24 Dec 2020 07:25:19 -0500
Subject: mm/filemap: Add folio_pos() and folio_file_pos()

These are just wrappers around page_offset() and page_file_offset()
respectively.  No change to generated code.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: David Howells <dhowells@redhat.com>
---
 include/linux/pagemap.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 95e83a9e73c7..3354117a1dae 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -561,6 +561,27 @@ static inline loff_t page_file_offset(struct page *page)
 	return ((loff_t)page_index(page)) << PAGE_SHIFT;
 }
 
+/**
+ * folio_pos - Returns the byte position of this folio in its file.
+ * @folio: The folio.
+ */
+static inline loff_t folio_pos(struct folio *folio)
+{
+	return page_offset(&folio->page);
+}
+
+/**
+ * folio_file_pos - Returns the byte position of this folio in its file.
+ * @folio: The folio.
+ *
+ * This differs from folio_pos() for folios which belong to a swap file.
+ * NFS is the only filesystem today which needs to use folio_file_pos().
+ */
+static inline loff_t folio_file_pos(struct folio *folio)
+{
+	return page_file_offset(&folio->page);
+}
+
 extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
 				     unsigned long address);
 
-- 
cgit v1.2.3


From 2f52578f9c64d7d9a96ab81c243cc20804fabf2b Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 10 Dec 2020 10:55:05 -0500
Subject: mm/util: Add folio_mapping() and folio_file_mapping()

These are the folio equivalent of page_mapping() and page_file_mapping().
Add an out-of-line page_mapping() wrapper around folio_mapping()
in order to prevent the page_folio() call from bloating every caller
of page_mapping().  Adjust page_file_mapping() and page_mapping_file()
to use folios internally.  Rename __page_file_mapping() to
swapcache_mapping() and change it to take a folio.

This ends up saving 122 bytes of text overall.  folio_mapping() is
45 bytes shorter than page_mapping() was, but the new page_mapping()
wrapper is 30 bytes.  The major reduction is a few bytes less in dozens
of nfs functions (which call page_file_mapping()).  Most of these appear
to be a slight change in gcc's register allocation decisions, which allow:

   48 8b 56 08         mov    0x8(%rsi),%rdx
   48 8d 42 ff         lea    -0x1(%rdx),%rax
   83 e2 01            and    $0x1,%edx
   48 0f 44 c6         cmove  %rsi,%rax

to become:

   48 8b 46 08         mov    0x8(%rsi),%rax
   48 8d 78 ff         lea    -0x1(%rax),%rdi
   a8 01               test   $0x1,%al
   48 0f 44 fe         cmove  %rsi,%rdi

for a reduction of a single byte.  Once the NFS client is converted to
use folios, this entire sequence will disappear.

Also add folio_mapping() documentation.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: David Howells <dhowells@redhat.com>
---
 include/linux/mm.h      | 14 --------------
 include/linux/pagemap.h | 35 +++++++++++++++++++++++++++++++++--
 include/linux/swap.h    |  6 ++++++
 3 files changed, 39 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 63685d953ce0..bc5c38e1f780 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1755,19 +1755,6 @@ void page_address_init(void);
 
 extern void *page_rmapping(struct page *page);
 extern struct anon_vma *page_anon_vma(struct page *page);
-extern struct address_space *page_mapping(struct page *page);
-
-extern struct address_space *__page_file_mapping(struct page *);
-
-static inline
-struct address_space *page_file_mapping(struct page *page)
-{
-	if (unlikely(PageSwapCache(page)))
-		return __page_file_mapping(page);
-
-	return page->mapping;
-}
-
 extern pgoff_t __page_file_index(struct page *page);
 
 /*
@@ -1782,7 +1769,6 @@ static inline pgoff_t page_index(struct page *page)
 }
 
 bool page_mapped(struct page *page);
-struct address_space *page_mapping(struct page *page);
 
 /*
  * Return true only if the page has been allocated with
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 3354117a1dae..0ca96a40fabe 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -162,14 +162,45 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping)
 
 void release_pages(struct page **pages, int nr);
 
+struct address_space *page_mapping(struct page *);
+struct address_space *folio_mapping(struct folio *);
+struct address_space *swapcache_mapping(struct folio *);
+
+/**
+ * folio_file_mapping - Find the mapping this folio belongs to.
+ * @folio: The folio.
+ *
+ * For folios which are in the page cache, return the mapping that this
+ * page belongs to.  Folios in the swap cache return the mapping of the
+ * swap file or swap device where the data is stored.  This is different
+ * from the mapping returned by folio_mapping().  The only reason to
+ * use it is if, like NFS, you return 0 from ->activate_swapfile.
+ *
+ * Do not call this for folios which aren't in the page cache or swap cache.
+ */
+static inline struct address_space *folio_file_mapping(struct folio *folio)
+{
+	if (unlikely(folio_test_swapcache(folio)))
+		return swapcache_mapping(folio);
+
+	return folio->mapping;
+}
+
+static inline struct address_space *page_file_mapping(struct page *page)
+{
+	return folio_file_mapping(page_folio(page));
+}
+
 /*
  * For file cache pages, return the address_space, otherwise return NULL
  */
 static inline struct address_space *page_mapping_file(struct page *page)
 {
-	if (unlikely(PageSwapCache(page)))
+	struct folio *folio = page_folio(page);
+
+	if (unlikely(folio_test_swapcache(folio)))
 		return NULL;
-	return page_mapping(page);
+	return folio_mapping(folio);
 }
 
 static inline bool page_cache_add_speculative(struct page *page, int count)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index ba52f3a3478e..85607c6c0cba 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -320,6 +320,12 @@ struct vma_swap_readahead {
 #endif
 };
 
+static inline swp_entry_t folio_swap_entry(struct folio *folio)
+{
+	swp_entry_t entry = { .val = page_private(&folio->page) };
+	return entry;
+}
+
 /* linux/mm/workingset.c */
 void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages);
 void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg);
-- 
cgit v1.2.3


From 4e1364286d0a2dd384bceb6db6185b99c0e2c0bc Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 7 Dec 2020 15:44:35 -0500
Subject: mm/filemap: Add folio_unlock()

Convert unlock_page() to call folio_unlock().  By using a folio we
avoid a call to compound_head().  This shortens the function from 39
bytes to 25 and removes 4 instructions on x86-64.  Because we still
have unlock_page(), it's a net increase of 16 bytes of text for the
kernel as a whole, but any path that uses folio_unlock() will execute
4 fewer instructions.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/pagemap.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 0ca96a40fabe..8087921641a3 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -657,7 +657,8 @@ extern int __lock_page_killable(struct page *page);
 extern int __lock_page_async(struct page *page, struct wait_page_queue *wait);
 extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
 				unsigned int flags);
-extern void unlock_page(struct page *page);
+void unlock_page(struct page *page);
+void folio_unlock(struct folio *folio);
 
 /*
  * Return true if the page was successfully locked
-- 
cgit v1.2.3


From 7c23c782d5d57df97509ed2fc17f9b9490f18f1b Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 1 Mar 2021 19:38:25 -0500
Subject: mm/filemap: Add folio_lock()

This is like lock_page() but for use by callers who know they have a folio.
Convert __lock_page() to be __folio_lock().  This saves one call to
compound_head() per contended call to lock_page().

Saves 455 bytes of text; mostly from improved register allocation and
inlining decisions.  __folio_lock is 59 bytes while __lock_page was 79.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
---
 include/linux/pagemap.h | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 8087921641a3..6481a431ea40 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -652,7 +652,7 @@ static inline bool wake_page_match(struct wait_page_queue *wait_page,
 	return true;
 }
 
-extern void __lock_page(struct page *page);
+void __folio_lock(struct folio *folio);
 extern int __lock_page_killable(struct page *page);
 extern int __lock_page_async(struct page *page, struct wait_page_queue *wait);
 extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
@@ -660,13 +660,24 @@ extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
 void unlock_page(struct page *page);
 void folio_unlock(struct folio *folio);
 
+static inline bool folio_trylock(struct folio *folio)
+{
+	return likely(!test_and_set_bit_lock(PG_locked, folio_flags(folio, 0)));
+}
+
 /*
  * Return true if the page was successfully locked
  */
 static inline int trylock_page(struct page *page)
 {
-	page = compound_head(page);
-	return (likely(!test_and_set_bit_lock(PG_locked, &page->flags)));
+	return folio_trylock(page_folio(page));
+}
+
+static inline void folio_lock(struct folio *folio)
+{
+	might_sleep();
+	if (!folio_trylock(folio))
+		__folio_lock(folio);
 }
 
 /*
@@ -674,9 +685,12 @@ static inline int trylock_page(struct page *page)
  */
 static inline void lock_page(struct page *page)
 {
+	struct folio *folio;
 	might_sleep();
-	if (!trylock_page(page))
-		__lock_page(page);
+
+	folio = page_folio(page);
+	if (!folio_trylock(folio))
+		__folio_lock(folio);
 }
 
 /*
-- 
cgit v1.2.3


From af7f29d9e1a7bda1429923327421367b69aa2e70 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 8 Dec 2020 00:07:31 -0500
Subject: mm/filemap: Add folio_lock_killable()

This is like lock_page_killable() but for use by callers who
know they have a folio.  Convert __lock_page_killable() to be
__folio_lock_killable().  This saves one call to compound_head() per
contended call to lock_page_killable().

__folio_lock_killable() is 19 bytes smaller than __lock_page_killable()
was.  filemap_fault() shrinks by 74 bytes and __lock_page_or_retry()
shrinks by 71 bytes.  That's a total of 164 bytes of text saved.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Reviewed-by: David Howells <dhowells@redhat.com>
---
 include/linux/pagemap.h | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 6481a431ea40..cf0ebd8c9e86 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -653,7 +653,7 @@ static inline bool wake_page_match(struct wait_page_queue *wait_page,
 }
 
 void __folio_lock(struct folio *folio);
-extern int __lock_page_killable(struct page *page);
+int __folio_lock_killable(struct folio *folio);
 extern int __lock_page_async(struct page *page, struct wait_page_queue *wait);
 extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
 				unsigned int flags);
@@ -693,6 +693,14 @@ static inline void lock_page(struct page *page)
 		__folio_lock(folio);
 }
 
+static inline int folio_lock_killable(struct folio *folio)
+{
+	might_sleep();
+	if (!folio_trylock(folio))
+		return __folio_lock_killable(folio);
+	return 0;
+}
+
 /*
  * lock_page_killable is like lock_page but can be interrupted by fatal
  * signals.  It returns 0 if it locked the page and -EINTR if it was
@@ -700,10 +708,7 @@ static inline void lock_page(struct page *page)
  */
 static inline int lock_page_killable(struct page *page)
 {
-	might_sleep();
-	if (!trylock_page(page))
-		return __lock_page_killable(page);
-	return 0;
+	return folio_lock_killable(page_folio(page));
 }
 
 /*
-- 
cgit v1.2.3


From ffdc8dabf20b1b894eda63e7ec9ca15ab0b7292c Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 30 Dec 2020 17:58:40 -0500
Subject: mm/filemap: Add __folio_lock_async()

There aren't any actual callers of lock_page_async(), so remove it.
Convert filemap_update_page() to call __folio_lock_async().

__folio_lock_async() is 21 bytes smaller than __lock_page_async(),
but the real savings come from using a folio in filemap_update_page(),
shrinking it from 515 bytes to 404 bytes, saving 110 bytes.  The text
shrinks by 132 bytes in total.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
---
 include/linux/pagemap.h | 17 -----------------
 1 file changed, 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index cf0ebd8c9e86..5b5e8bd0b3fb 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -654,7 +654,6 @@ static inline bool wake_page_match(struct wait_page_queue *wait_page,
 
 void __folio_lock(struct folio *folio);
 int __folio_lock_killable(struct folio *folio);
-extern int __lock_page_async(struct page *page, struct wait_page_queue *wait);
 extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
 				unsigned int flags);
 void unlock_page(struct page *page);
@@ -711,22 +710,6 @@ static inline int lock_page_killable(struct page *page)
 	return folio_lock_killable(page_folio(page));
 }
 
-/*
- * lock_page_async - Lock the page, unless this would block. If the page
- * is already locked, then queue a callback when the page becomes unlocked.
- * This callback can then retry the operation.
- *
- * Returns 0 if the page is locked successfully, or -EIOCBQUEUED if the page
- * was already locked and the callback defined in 'wait' was queued.
- */
-static inline int lock_page_async(struct page *page,
-				  struct wait_page_queue *wait)
-{
-	if (!trylock_page(page))
-		return __lock_page_async(page, wait);
-	return 0;
-}
-
 /*
  * lock_page_or_retry - Lock the page, unless this would block and the
  * caller indicated that it can handle a retry.
-- 
cgit v1.2.3


From 6baa8d602e84d97a7541ed94ccaeb6a3f9763111 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 4 Mar 2021 10:21:02 -0500
Subject: mm/filemap: Add folio_wait_locked()

Also add folio_wait_locked_killable().  Turn wait_on_page_locked() and
wait_on_page_locked_killable() into wrappers.  This eliminates a call
to compound_head() from each call-site, reducing text size by 193 bytes
for me.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
---
 include/linux/pagemap.h | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 5b5e8bd0b3fb..77dbb7f625af 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -732,23 +732,33 @@ extern void wait_on_page_bit(struct page *page, int bit_nr);
 extern int wait_on_page_bit_killable(struct page *page, int bit_nr);
 
 /* 
- * Wait for a page to be unlocked.
+ * Wait for a folio to be unlocked.
  *
- * This must be called with the caller "holding" the page,
- * ie with increased "page->count" so that the page won't
+ * This must be called with the caller "holding" the folio,
+ * ie with increased "page->count" so that the folio won't
  * go away during the wait..
  */
+static inline void folio_wait_locked(struct folio *folio)
+{
+	if (folio_test_locked(folio))
+		wait_on_page_bit(&folio->page, PG_locked);
+}
+
+static inline int folio_wait_locked_killable(struct folio *folio)
+{
+	if (!folio_test_locked(folio))
+		return 0;
+	return wait_on_page_bit_killable(&folio->page, PG_locked);
+}
+
 static inline void wait_on_page_locked(struct page *page)
 {
-	if (PageLocked(page))
-		wait_on_page_bit(compound_head(page), PG_locked);
+	folio_wait_locked(page_folio(page));
 }
 
 static inline int wait_on_page_locked_killable(struct page *page)
 {
-	if (!PageLocked(page))
-		return 0;
-	return wait_on_page_bit_killable(compound_head(page), PG_locked);
+	return folio_wait_locked_killable(page_folio(page));
 }
 
 int put_and_wait_on_page_locked(struct page *page, int state);
-- 
cgit v1.2.3


From 9138e47ed425246102317dbe452f7f0d4a54c4a2 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 18 Mar 2021 21:39:45 -0400
Subject: mm/filemap: Add __folio_lock_or_retry()

Convert __lock_page_or_retry() to __folio_lock_or_retry().  This actually
saves 4 bytes in the only caller of lock_page_or_retry() (due to better
register allocation) and saves the 14 byte cost of calling page_folio()
in __folio_lock_or_retry() for a total saving of 18 bytes.  Also use
a bool for the return type.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/pagemap.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 77dbb7f625af..7cf140f98910 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -654,7 +654,7 @@ static inline bool wake_page_match(struct wait_page_queue *wait_page,
 
 void __folio_lock(struct folio *folio);
 int __folio_lock_killable(struct folio *folio);
-extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
+bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm,
 				unsigned int flags);
 void unlock_page(struct page *page);
 void folio_unlock(struct folio *folio);
@@ -715,13 +715,16 @@ static inline int lock_page_killable(struct page *page)
  * caller indicated that it can handle a retry.
  *
  * Return value and mmap_lock implications depend on flags; see
- * __lock_page_or_retry().
+ * __folio_lock_or_retry().
  */
-static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm,
+static inline bool lock_page_or_retry(struct page *page, struct mm_struct *mm,
 				     unsigned int flags)
 {
+	struct folio *folio;
 	might_sleep();
-	return trylock_page(page) || __lock_page_or_retry(page, mm, flags);
+
+	folio = page_folio(page);
+	return folio_trylock(folio) || __folio_lock_or_retry(folio, mm, flags);
 }
 
 /*
-- 
cgit v1.2.3


From 575ced1c8b0d3b578b933a68ce67ddaff3df9506 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 8 Dec 2020 01:25:39 -0500
Subject: mm/swap: Add folio_rotate_reclaimable()

Convert rotate_reclaimable_page() to folio_rotate_reclaimable().  This
eliminates all five of the calls to compound_head() in this function,
saving 75 bytes at the cost of adding 15 bytes to its one caller,
end_page_writeback().  We also save 36 bytes from pagevec_move_tail_fn()
due to using folios there.  Net 96 bytes savings.

Also move its declaration to mm/internal.h as it's only used by filemap.c.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Reviewed-by: David Howells <dhowells@redhat.com>
---
 include/linux/swap.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 85607c6c0cba..c7ecd3ad8e2e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -371,7 +371,6 @@ extern void lru_add_drain(void);
 extern void lru_add_drain_cpu(int cpu);
 extern void lru_add_drain_cpu_zone(struct zone *zone);
 extern void lru_add_drain_all(void);
-extern void rotate_reclaimable_page(struct page *page);
 extern void deactivate_file_page(struct page *page);
 extern void deactivate_page(struct page *page);
 extern void mark_page_lazyfree(struct page *page);
-- 
cgit v1.2.3


From 4268b48077e55a93959f368aa9d3103ede5d3f0f Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 3 Mar 2021 15:21:55 -0500
Subject: mm/filemap: Add folio_end_writeback()

Add an end_page_writeback() wrapper function for users that are not yet
converted to folios.

folio_end_writeback() is less than half the size of end_page_writeback()
at just 105 bytes compared to 228 bytes, due to removing all the
compound_head() calls.  The 30 byte wrapper function makes this a net
saving of 93 bytes.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
---
 include/linux/pagemap.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 7cf140f98910..d9dcd335cc18 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -767,7 +767,8 @@ static inline int wait_on_page_locked_killable(struct page *page)
 int put_and_wait_on_page_locked(struct page *page, int state);
 void wait_on_page_writeback(struct page *page);
 int wait_on_page_writeback_killable(struct page *page);
-extern void end_page_writeback(struct page *page);
+void end_page_writeback(struct page *page);
+void folio_end_writeback(struct folio *folio);
 void wait_for_stable_page(struct page *page);
 
 void __set_page_dirty(struct page *, struct address_space *, int warn);
-- 
cgit v1.2.3


From 490e016f229a79dc7551e7f0e989d2304416c189 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 4 Mar 2021 11:09:17 -0500
Subject: mm/writeback: Add folio_wait_writeback()

wait_on_page_writeback_killable() only has one caller, so convert it to
call folio_wait_writeback_killable().  For the wait_on_page_writeback()
callers, add a compatibility wrapper around folio_wait_writeback().

Turning PageWriteback() into folio_test_writeback() eliminates a call
to compound_head() which saves 8 bytes and 15 bytes in the two
functions.  Unfortunately, that is more than offset by adding the
wait_on_page_writeback compatibility wrapper for a net increase in text
of 7 bytes.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Reviewed-by: David Howells <dhowells@redhat.com>
---
 include/linux/pagemap.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index d9dcd335cc18..e6013c063986 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -766,7 +766,8 @@ static inline int wait_on_page_locked_killable(struct page *page)
 
 int put_and_wait_on_page_locked(struct page *page, int state);
 void wait_on_page_writeback(struct page *page);
-int wait_on_page_writeback_killable(struct page *page);
+void folio_wait_writeback(struct folio *folio);
+int folio_wait_writeback_killable(struct folio *folio);
 void end_page_writeback(struct page *page);
 void folio_end_writeback(struct folio *folio);
 void wait_for_stable_page(struct page *page);
-- 
cgit v1.2.3


From a49d0c507759214a7cfd26555382c314db486792 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 4 Mar 2021 11:25:25 -0500
Subject: mm/writeback: Add folio_wait_stable()

Move wait_for_stable_page() into the folio compatibility file.
folio_wait_stable() avoids a call to compound_head() and is 14 bytes
smaller than wait_for_stable_page() was.  The net text size grows by 16
bytes as a result of this patch.  We can also remove thp_head() as this
was the last user.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: David Howells <dhowells@redhat.com>
---
 include/linux/huge_mm.h | 15 ---------------
 include/linux/pagemap.h |  1 +
 2 files changed, 1 insertion(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index f123e15d966e..f280f33ff223 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -250,15 +250,6 @@ static inline spinlock_t *pud_trans_huge_lock(pud_t *pud,
 		return NULL;
 }
 
-/**
- * thp_head - Head page of a transparent huge page.
- * @page: Any page (tail, head or regular) found in the page cache.
- */
-static inline struct page *thp_head(struct page *page)
-{
-	return compound_head(page);
-}
-
 /**
  * thp_order - Order of a transparent huge page.
  * @page: Head page of a transparent huge page.
@@ -336,12 +327,6 @@ static inline struct list_head *page_deferred_list(struct page *page)
 #define HPAGE_PUD_MASK ({ BUILD_BUG(); 0; })
 #define HPAGE_PUD_SIZE ({ BUILD_BUG(); 0; })
 
-static inline struct page *thp_head(struct page *page)
-{
-	VM_BUG_ON_PGFLAGS(PageTail(page), page);
-	return page;
-}
-
 static inline unsigned int thp_order(struct page *page)
 {
 	VM_BUG_ON_PGFLAGS(PageTail(page), page);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index e6013c063986..ee39ad7b42f1 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -771,6 +771,7 @@ int folio_wait_writeback_killable(struct folio *folio);
 void end_page_writeback(struct page *page);
 void folio_end_writeback(struct folio *folio);
 void wait_for_stable_page(struct page *page);
+void folio_wait_stable(struct folio *folio);
 
 void __set_page_dirty(struct page *, struct address_space *, int warn);
 int __set_page_dirty_nobuffers(struct page *page);
-- 
cgit v1.2.3


From 101c0bf67f50ca0e8b9da97b26f8dc7cb232b4d3 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 4 Mar 2021 12:02:54 -0500
Subject: mm/filemap: Add folio_wait_bit()

Rename wait_on_page_bit() to folio_wait_bit().  We must always wait on
the folio, otherwise we won't be woken up due to the tail page hashing
to a different bucket from the head page.

This commit shrinks the kernel by 770 bytes, mostly due to moving
the page waitqueue lookup into folio_wait_bit_common().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
---
 include/linux/pagemap.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index ee39ad7b42f1..2f481327dee8 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -728,11 +728,11 @@ static inline bool lock_page_or_retry(struct page *page, struct mm_struct *mm,
 }
 
 /*
- * This is exported only for wait_on_page_locked/wait_on_page_writeback, etc.,
+ * This is exported only for folio_wait_locked/folio_wait_writeback, etc.,
  * and should not be used directly.
  */
-extern void wait_on_page_bit(struct page *page, int bit_nr);
-extern int wait_on_page_bit_killable(struct page *page, int bit_nr);
+void folio_wait_bit(struct folio *folio, int bit_nr);
+int folio_wait_bit_killable(struct folio *folio, int bit_nr);
 
 /* 
  * Wait for a folio to be unlocked.
@@ -744,14 +744,14 @@ extern int wait_on_page_bit_killable(struct page *page, int bit_nr);
 static inline void folio_wait_locked(struct folio *folio)
 {
 	if (folio_test_locked(folio))
-		wait_on_page_bit(&folio->page, PG_locked);
+		folio_wait_bit(folio, PG_locked);
 }
 
 static inline int folio_wait_locked_killable(struct folio *folio)
 {
 	if (!folio_test_locked(folio))
 		return 0;
-	return wait_on_page_bit_killable(&folio->page, PG_locked);
+	return folio_wait_bit_killable(folio, PG_locked);
 }
 
 static inline void wait_on_page_locked(struct page *page)
-- 
cgit v1.2.3


From df4d4f12739495332e0d1f916ef4270f7d25d207 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Sat, 16 Jan 2021 11:22:14 -0500
Subject: mm/filemap: Convert page wait queues to be folios

Reinforce that page flags are actually in the head page by changing the
type from page to folio.  Increases the size of cachefiles by two bytes,
but the kernel core is unchanged in size.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: David Howells <dhowells@redhat.com>
---
 include/linux/pagemap.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 2f481327dee8..ebc62e9e453b 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -628,13 +628,13 @@ static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
 }
 
 struct wait_page_key {
-	struct page *page;
+	struct folio *folio;
 	int bit_nr;
 	int page_match;
 };
 
 struct wait_page_queue {
-	struct page *page;
+	struct folio *folio;
 	int bit_nr;
 	wait_queue_entry_t wait;
 };
@@ -642,7 +642,7 @@ struct wait_page_queue {
 static inline bool wake_page_match(struct wait_page_queue *wait_page,
 				  struct wait_page_key *key)
 {
-	if (wait_page->page != key->page)
+	if (wait_page->folio != key->folio)
 	       return false;
 	key->page_match = 1;
 
@@ -802,7 +802,7 @@ int wait_on_page_private_2_killable(struct page *page);
 /*
  * Add an arbitrary waiter to a page's wait queue
  */
-extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter);
+void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter);
 
 /*
  * Fault everything in given userspace address range in.
-- 
cgit v1.2.3


From b47393f8448ade8bafe09ed302ce2a15093e9718 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 22 Apr 2021 22:58:32 -0400
Subject: mm/filemap: Add folio private_2 functions

end_page_private_2() becomes folio_end_private_2(),
wait_on_page_private_2() becomes folio_wait_private_2() and
wait_on_page_private_2_killable() becomes folio_wait_private_2_killable().

Adjust the fscache equivalents to call page_folio() before calling these
functions to avoid adding wrappers.  Ends up costing 1 byte of text
in ceph & netfs, but the core shrinks by three calls to page_folio().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 include/linux/netfs.h   | 6 +++---
 include/linux/pagemap.h | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 5d6a4158a9a6..3d4cbf2f7dc4 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -55,7 +55,7 @@ static inline void set_page_fscache(struct page *page)
  */
 static inline void end_page_fscache(struct page *page)
 {
-	end_page_private_2(page);
+	folio_end_private_2(page_folio(page));
 }
 
 /**
@@ -66,7 +66,7 @@ static inline void end_page_fscache(struct page *page)
  */
 static inline void wait_on_page_fscache(struct page *page)
 {
-	wait_on_page_private_2(page);
+	folio_wait_private_2(page_folio(page));
 }
 
 /**
@@ -82,7 +82,7 @@ static inline void wait_on_page_fscache(struct page *page)
  */
 static inline int wait_on_page_fscache_killable(struct page *page)
 {
-	return wait_on_page_private_2_killable(page);
+	return folio_wait_private_2_killable(page_folio(page));
 }
 
 enum netfs_read_source {
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index ebc62e9e453b..05f91bfc048d 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -795,9 +795,9 @@ static inline void set_page_private_2(struct page *page)
 	SetPagePrivate2(page);
 }
 
-void end_page_private_2(struct page *page);
-void wait_on_page_private_2(struct page *page);
-int wait_on_page_private_2_killable(struct page *page);
+void folio_end_private_2(struct folio *folio);
+void folio_wait_private_2(struct folio *folio);
+int folio_wait_private_2_killable(struct folio *folio);
 
 /*
  * Add an arbitrary waiter to a page's wait queue
-- 
cgit v1.2.3


From 6abbaa5b01730a1f0883d199cf5a90ae5c5dccea Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 27 Apr 2021 14:24:30 -0400
Subject: fs/netfs: Add folio fscache functions

Match the page writeback functions by adding
folio_start_fscache(), folio_end_fscache(), folio_wait_fscache() and
folio_wait_fscache_killable().  Remove set_page_private_2().  Also rewrite
the kernel-doc to describe when to use the function rather than what the
function does, and include the kernel-doc in the appropriate rst file.
Saves 31 bytes of text in netfs_rreq_unlock() due to set_page_fscache()
calling page_folio() once instead of three times.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Reviewed-by: David Howells <dhowells@redhat.com>
---
 include/linux/netfs.h   | 75 +++++++++++++++++++++++++++++++------------------
 include/linux/pagemap.h | 16 -----------
 2 files changed, 48 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 3d4cbf2f7dc4..12c4177f7703 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -22,6 +22,7 @@
  * Overload PG_private_2 to give us PG_fscache - this is used to indicate that
  * a page is currently backed by a local disk cache
  */
+#define folio_test_fscache(folio)	folio_test_private_2(folio)
 #define PageFsCache(page)		PagePrivate2((page))
 #define SetPageFsCache(page)		SetPagePrivate2((page))
 #define ClearPageFsCache(page)		ClearPagePrivate2((page))
@@ -29,57 +30,77 @@
 #define TestClearPageFsCache(page)	TestClearPagePrivate2((page))
 
 /**
- * set_page_fscache - Set PG_fscache on a page and take a ref
- * @page: The page.
+ * folio_start_fscache - Start an fscache write on a folio.
+ * @folio: The folio.
  *
- * Set the PG_fscache (PG_private_2) flag on a page and take the reference
- * needed for the VM to handle its lifetime correctly.  This sets the flag and
- * takes the reference unconditionally, so care must be taken not to set the
- * flag again if it's already set.
+ * Call this function before writing a folio to a local cache.  Starting a
+ * second write before the first one finishes is not allowed.
  */
-static inline void set_page_fscache(struct page *page)
+static inline void folio_start_fscache(struct folio *folio)
 {
-	set_page_private_2(page);
+	VM_BUG_ON_FOLIO(folio_test_private_2(folio), folio);
+	folio_get(folio);
+	folio_set_private_2(folio);
 }
 
 /**
- * end_page_fscache - Clear PG_fscache and release any waiters
- * @page: The page
- *
- * Clear the PG_fscache (PG_private_2) bit on a page and wake up any sleepers
- * waiting for this.  The page ref held for PG_private_2 being set is released.
+ * folio_end_fscache - End an fscache write on a folio.
+ * @folio: The folio.
  *
- * This is, for example, used when a netfs page is being written to a local
- * disk cache, thereby allowing writes to the cache for the same page to be
- * serialised.
+ * Call this function after the folio has been written to the local cache.
+ * This will wake any sleepers waiting on this folio.
  */
-static inline void end_page_fscache(struct page *page)
+static inline void folio_end_fscache(struct folio *folio)
 {
-	folio_end_private_2(page_folio(page));
+	folio_end_private_2(folio);
 }
 
 /**
- * wait_on_page_fscache - Wait for PG_fscache to be cleared on a page
- * @page: The page to wait on
+ * folio_wait_fscache - Wait for an fscache write on this folio to end.
+ * @folio: The folio.
  *
- * Wait for PG_fscache (aka PG_private_2) to be cleared on a page.
+ * If this folio is currently being written to a local cache, wait for
+ * the write to finish.  Another write may start after this one finishes,
+ * unless the caller holds the folio lock.
  */
-static inline void wait_on_page_fscache(struct page *page)
+static inline void folio_wait_fscache(struct folio *folio)
 {
-	folio_wait_private_2(page_folio(page));
+	folio_wait_private_2(folio);
 }
 
 /**
- * wait_on_page_fscache_killable - Wait for PG_fscache to be cleared on a page
- * @page: The page to wait on
+ * folio_wait_fscache_killable - Wait for an fscache write on this folio to end.
+ * @folio: The folio.
  *
- * Wait for PG_fscache (aka PG_private_2) to be cleared on a page or until a
- * fatal signal is received by the calling task.
+ * If this folio is currently being written to a local cache, wait
+ * for the write to finish or for a fatal signal to be received.
+ * Another write may start after this one finishes, unless the caller
+ * holds the folio lock.
  *
  * Return:
  * - 0 if successful.
  * - -EINTR if a fatal signal was encountered.
  */
+static inline int folio_wait_fscache_killable(struct folio *folio)
+{
+	return folio_wait_private_2_killable(folio);
+}
+
+static inline void set_page_fscache(struct page *page)
+{
+	folio_start_fscache(page_folio(page));
+}
+
+static inline void end_page_fscache(struct page *page)
+{
+	folio_end_private_2(page_folio(page));
+}
+
+static inline void wait_on_page_fscache(struct page *page)
+{
+	folio_wait_private_2(page_folio(page));
+}
+
 static inline int wait_on_page_fscache_killable(struct page *page)
 {
 	return folio_wait_private_2_killable(page_folio(page));
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 05f91bfc048d..bdbd7be67812 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -779,22 +779,6 @@ int __set_page_dirty_no_writeback(struct page *page);
 
 void page_endio(struct page *page, bool is_write, int err);
 
-/**
- * set_page_private_2 - Set PG_private_2 on a page and take a ref
- * @page: The page.
- *
- * Set the PG_private_2 flag on a page and take the reference needed for the VM
- * to handle its lifetime correctly.  This sets the flag and takes the
- * reference unconditionally, so care must be taken not to set the flag again
- * if it's already set.
- */
-static inline void set_page_private_2(struct page *page)
-{
-	page = compound_head(page);
-	get_page(page);
-	SetPagePrivate2(page);
-}
-
 void folio_end_private_2(struct folio *folio);
 void folio_wait_private_2(struct folio *folio);
 int folio_wait_private_2_killable(struct folio *folio);
-- 
cgit v1.2.3


From dd10ab049beb479dc83bb14a7b5cd68c363983ce Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 12 Apr 2021 16:45:17 -0400
Subject: mm: Add folio_mapped()

This function is the equivalent of page_mapped().  It is slightly
shorter as we do not need to handle the PageTail() case.  Reimplement
page_mapped() as a wrapper around folio_mapped().  folio_mapped()
is 13 bytes smaller than page_mapped(), but the page_mapped() wrapper
is 30 bytes, for a net increase of 17 bytes of text.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
---
 include/linux/mm.h       | 1 +
 include/linux/mm_types.h | 6 ++++++
 2 files changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index bc5c38e1f780..95e36d0475ac 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1769,6 +1769,7 @@ static inline pgoff_t page_index(struct page *page)
 }
 
 bool page_mapped(struct page *page);
+bool folio_mapped(struct folio *folio);
 
 /*
  * Return true only if the page has been allocated with
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 5ebcb86ac934..82dab23205c3 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -299,6 +299,12 @@ FOLIO_MATCH(memcg_data, memcg_data);
 #endif
 #undef FOLIO_MATCH
 
+static inline atomic_t *folio_mapcount_ptr(struct folio *folio)
+{
+	struct page *tail = &folio->page + 1;
+	return &tail->compound_mapcount;
+}
+
 static inline atomic_t *compound_mapcount_ptr(struct page *page)
 {
 	return &page[1].compound_mapcount;
-- 
cgit v1.2.3


From 874fd90cafdca9d678bceb88f91322af8c9a9d2d Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 25 Jun 2021 09:27:29 -0400
Subject: mm: Add folio_nid()

This is the folio equivalent of page_to_nid().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/mm.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 95e36d0475ac..04e41d4d85ea 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1429,6 +1429,11 @@ static inline int page_to_nid(const struct page *page)
 }
 #endif
 
+static inline int folio_nid(const struct folio *folio)
+{
+	return page_to_nid(&folio->page);
+}
+
 #ifdef CONFIG_NUMA_BALANCING
 static inline int cpu_pid_to_cpupid(int cpu, int pid)
 {
-- 
cgit v1.2.3


From 1b7e4464d43a488e383843bf96ec62d12393bff1 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 28 Jun 2021 14:59:26 -0400
Subject: mm/memcg: Add folio_memcg() and related functions

memcg information is only stored in the head page, so the memcg
subsystem needs to assure that all accesses are to the head page.
The first step is converting page_memcg() to folio_memcg().

The callers of page_memcg() and PageMemcgKmem() are not yet ready to be
converted to use folios, so retain them as wrappers around folio_memcg()
and folio_memcg_kmem().  They will be converted in a later patch set.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/memcontrol.h | 110 ++++++++++++++++++++++++++-------------------
 1 file changed, 65 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 3096c9a0ee01..06659670db32 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -369,7 +369,7 @@ enum page_memcg_data_flags {
 
 #define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1)
 
-static inline bool PageMemcgKmem(struct page *page);
+static inline bool folio_memcg_kmem(struct folio *folio);
 
 /*
  * After the initialization objcg->memcg is always pointing at
@@ -384,73 +384,77 @@ static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
 }
 
 /*
- * __page_memcg - get the memory cgroup associated with a non-kmem page
- * @page: a pointer to the page struct
+ * __folio_memcg - Get the memory cgroup associated with a non-kmem folio
+ * @folio: Pointer to the folio.
  *
- * Returns a pointer to the memory cgroup associated with the page,
- * or NULL. This function assumes that the page is known to have a
+ * Returns a pointer to the memory cgroup associated with the folio,
+ * or NULL. This function assumes that the folio is known to have a
  * proper memory cgroup pointer. It's not safe to call this function
- * against some type of pages, e.g. slab pages or ex-slab pages or
- * kmem pages.
+ * against some type of folios, e.g. slab folios or ex-slab folios or
+ * kmem folios.
  */
-static inline struct mem_cgroup *__page_memcg(struct page *page)
+static inline struct mem_cgroup *__folio_memcg(struct folio *folio)
 {
-	unsigned long memcg_data = page->memcg_data;
+	unsigned long memcg_data = folio->memcg_data;
 
-	VM_BUG_ON_PAGE(PageSlab(page), page);
-	VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page);
-	VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page);
+	VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
+	VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJCGS, folio);
+	VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_KMEM, folio);
 
 	return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
 }
 
 /*
- * __page_objcg - get the object cgroup associated with a kmem page
- * @page: a pointer to the page struct
+ * __folio_objcg - get the object cgroup associated with a kmem folio.
+ * @folio: Pointer to the folio.
  *
- * Returns a pointer to the object cgroup associated with the page,
- * or NULL. This function assumes that the page is known to have a
+ * Returns a pointer to the object cgroup associated with the folio,
+ * or NULL. This function assumes that the folio is known to have a
  * proper object cgroup pointer. It's not safe to call this function
- * against some type of pages, e.g. slab pages or ex-slab pages or
- * LRU pages.
+ * against some type of folios, e.g. slab folios or ex-slab folios or
+ * LRU folios.
  */
-static inline struct obj_cgroup *__page_objcg(struct page *page)
+static inline struct obj_cgroup *__folio_objcg(struct folio *folio)
 {
-	unsigned long memcg_data = page->memcg_data;
+	unsigned long memcg_data = folio->memcg_data;
 
-	VM_BUG_ON_PAGE(PageSlab(page), page);
-	VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page);
-	VM_BUG_ON_PAGE(!(memcg_data & MEMCG_DATA_KMEM), page);
+	VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
+	VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJCGS, folio);
+	VM_BUG_ON_FOLIO(!(memcg_data & MEMCG_DATA_KMEM), folio);
 
 	return (struct obj_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
 }
 
 /*
- * page_memcg - get the memory cgroup associated with a page
- * @page: a pointer to the page struct
+ * folio_memcg - Get the memory cgroup associated with a folio.
+ * @folio: Pointer to the folio.
  *
- * Returns a pointer to the memory cgroup associated with the page,
- * or NULL. This function assumes that the page is known to have a
+ * Returns a pointer to the memory cgroup associated with the folio,
+ * or NULL. This function assumes that the folio is known to have a
  * proper memory cgroup pointer. It's not safe to call this function
- * against some type of pages, e.g. slab pages or ex-slab pages.
+ * against some type of folios, e.g. slab folios or ex-slab folios.
  *
- * For a non-kmem page any of the following ensures page and memcg binding
+ * For a non-kmem folio any of the following ensures folio and memcg binding
  * stability:
  *
- * - the page lock
+ * - the folio lock
  * - LRU isolation
  * - lock_page_memcg()
  * - exclusive reference
  *
- * For a kmem page a caller should hold an rcu read lock to protect memcg
- * associated with a kmem page from being released.
+ * For a kmem folio a caller should hold an rcu read lock to protect memcg
+ * associated with a kmem folio from being released.
  */
+static inline struct mem_cgroup *folio_memcg(struct folio *folio)
+{
+	if (folio_memcg_kmem(folio))
+		return obj_cgroup_memcg(__folio_objcg(folio));
+	return __folio_memcg(folio);
+}
+
 static inline struct mem_cgroup *page_memcg(struct page *page)
 {
-	if (PageMemcgKmem(page))
-		return obj_cgroup_memcg(__page_objcg(page));
-	else
-		return __page_memcg(page);
+	return folio_memcg(page_folio(page));
 }
 
 /*
@@ -523,17 +527,18 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page)
 
 #ifdef CONFIG_MEMCG_KMEM
 /*
- * PageMemcgKmem - check if the page has MemcgKmem flag set
- * @page: a pointer to the page struct
+ * folio_memcg_kmem - Check if the folio has the memcg_kmem flag set.
+ * @folio: Pointer to the folio.
  *
- * Checks if the page has MemcgKmem flag set. The caller must ensure that
- * the page has an associated memory cgroup. It's not safe to call this function
- * against some types of pages, e.g. slab pages.
+ * Checks if the folio has MemcgKmem flag set. The caller must ensure
+ * that the folio has an associated memory cgroup. It's not safe to call
+ * this function against some types of folios, e.g. slab folios.
  */
-static inline bool PageMemcgKmem(struct page *page)
+static inline bool folio_memcg_kmem(struct folio *folio)
 {
-	VM_BUG_ON_PAGE(page->memcg_data & MEMCG_DATA_OBJCGS, page);
-	return page->memcg_data & MEMCG_DATA_KMEM;
+	VM_BUG_ON_PGFLAGS(PageTail(&folio->page), &folio->page);
+	VM_BUG_ON_FOLIO(folio->memcg_data & MEMCG_DATA_OBJCGS, folio);
+	return folio->memcg_data & MEMCG_DATA_KMEM;
 }
 
 /*
@@ -577,7 +582,7 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page)
 }
 
 #else
-static inline bool PageMemcgKmem(struct page *page)
+static inline bool folio_memcg_kmem(struct folio *folio)
 {
 	return false;
 }
@@ -593,6 +598,11 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page)
 }
 #endif
 
+static inline bool PageMemcgKmem(struct page *page)
+{
+	return folio_memcg_kmem(page_folio(page));
+}
+
 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
 {
 	return (memcg == root_mem_cgroup);
@@ -1115,6 +1125,11 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 #define MEM_CGROUP_ID_SHIFT	0
 #define MEM_CGROUP_ID_MAX	0
 
+static inline struct mem_cgroup *folio_memcg(struct folio *folio)
+{
+	return NULL;
+}
+
 static inline struct mem_cgroup *page_memcg(struct page *page)
 {
 	return NULL;
@@ -1131,6 +1146,11 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page)
 	return NULL;
 }
 
+static inline bool folio_memcg_kmem(struct folio *folio)
+{
+	return false;
+}
+
 static inline bool PageMemcgKmem(struct page *page)
 {
 	return false;
-- 
cgit v1.2.3


From 8f425e4ed0eb3ef0b2d85a9efccf947ca6aa9b1c Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 25 Jun 2021 09:27:04 -0400
Subject: mm/memcg: Convert mem_cgroup_charge() to take a folio

Convert all callers of mem_cgroup_charge() to call page_folio() on the
page they're currently passing in.  Many of them will be converted to
use folios themselves soon.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/memcontrol.h | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 06659670db32..19a51729e00c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -694,14 +694,28 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg)
 		page_counter_read(&memcg->memory);
 }
 
-int __mem_cgroup_charge(struct page *page, struct mm_struct *mm,
-			gfp_t gfp_mask);
-static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
-				    gfp_t gfp_mask)
+int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp);
+
+/**
+ * mem_cgroup_charge - Charge a newly allocated folio to a cgroup.
+ * @folio: Folio to charge.
+ * @mm: mm context of the allocating task.
+ * @gfp: Reclaim mode.
+ *
+ * Try to charge @folio to the memcg that @mm belongs to, reclaiming
+ * pages according to @gfp if necessary.  If @mm is NULL, try to
+ * charge to the active memcg.
+ *
+ * Do not use this for folios allocated for swapin.
+ *
+ * Return: 0 on success. Otherwise, an error code is returned.
+ */
+static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm,
+				    gfp_t gfp)
 {
 	if (mem_cgroup_disabled())
 		return 0;
-	return __mem_cgroup_charge(page, mm, gfp_mask);
+	return __mem_cgroup_charge(folio, mm, gfp);
 }
 
 int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
@@ -1199,8 +1213,8 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg)
 	return false;
 }
 
-static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
-				    gfp_t gfp_mask)
+static inline int mem_cgroup_charge(struct folio *folio,
+		struct mm_struct *mm, gfp_t gfp)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From bbc6b703b21963e909f633cf7718903ed5094319 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Sat, 1 May 2021 20:42:23 -0400
Subject: mm/memcg: Convert mem_cgroup_uncharge() to take a folio

Convert all the callers to call page_folio().  Most of them were already
using a head page, but a few of them I can't prove were, so this may
actually fix a bug.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/memcontrol.h | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 19a51729e00c..b4bc052db32b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -722,12 +722,19 @@ int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
 				  gfp_t gfp, swp_entry_t entry);
 void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry);
 
-void __mem_cgroup_uncharge(struct page *page);
-static inline void mem_cgroup_uncharge(struct page *page)
+void __mem_cgroup_uncharge(struct folio *folio);
+
+/**
+ * mem_cgroup_uncharge - Uncharge a folio.
+ * @folio: Folio to uncharge.
+ *
+ * Uncharge a folio previously charged with mem_cgroup_charge().
+ */
+static inline void mem_cgroup_uncharge(struct folio *folio)
 {
 	if (mem_cgroup_disabled())
 		return;
-	__mem_cgroup_uncharge(page);
+	__mem_cgroup_uncharge(folio);
 }
 
 void __mem_cgroup_uncharge_list(struct list_head *page_list);
@@ -1229,7 +1236,7 @@ static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
 {
 }
 
-static inline void mem_cgroup_uncharge(struct page *page)
+static inline void mem_cgroup_uncharge(struct folio *folio)
 {
 }
 
-- 
cgit v1.2.3


From d21bba2b7d0ae19dd1279e10aee61c37a17aba74 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 6 May 2021 18:14:59 -0400
Subject: mm/memcg: Convert mem_cgroup_migrate() to take folios

Convert all callers of mem_cgroup_migrate() to call page_folio() first.
They all look like they're using head pages already, but this proves it.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/memcontrol.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b4bc052db32b..07eda24ec581 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -745,7 +745,7 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
 	__mem_cgroup_uncharge_list(page_list);
 }
 
-void mem_cgroup_migrate(struct page *oldpage, struct page *newpage);
+void mem_cgroup_migrate(struct folio *old, struct folio *new);
 
 /**
  * mem_cgroup_lruvec - get the lru list vector for a memcg & node
@@ -1244,7 +1244,7 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
 {
 }
 
-static inline void mem_cgroup_migrate(struct page *old, struct page *new)
+static inline void mem_cgroup_migrate(struct folio *old, struct folio *new)
 {
 }
 
-- 
cgit v1.2.3


From 9d8053fc7a21ee2b3a540165d09418955258d9e8 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 4 May 2021 11:43:01 -0400
Subject: mm/memcg: Convert mem_cgroup_track_foreign_dirty_slowpath() to folio

The page was only being used for the memcg and to gather trace
information, so this is a simple conversion.  The only caller of
mem_cgroup_track_foreign_dirty() will be converted to folios in a later
patch, so doing this now makes that patch simpler.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/memcontrol.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 07eda24ec581..1c2776c3a223 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1599,17 +1599,18 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
 			 unsigned long *pheadroom, unsigned long *pdirty,
 			 unsigned long *pwriteback);
 
-void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
+void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
 					     struct bdi_writeback *wb);
 
 static inline void mem_cgroup_track_foreign_dirty(struct page *page,
 						  struct bdi_writeback *wb)
 {
+	struct folio *folio = page_folio(page);
 	if (mem_cgroup_disabled())
 		return;
 
-	if (unlikely(&page_memcg(page)->css != wb->memcg_css))
-		mem_cgroup_track_foreign_dirty_slowpath(page, wb);
+	if (unlikely(&folio_memcg(folio)->css != wb->memcg_css))
+		mem_cgroup_track_foreign_dirty_slowpath(folio, wb);
 }
 
 void mem_cgroup_flush_foreign(struct bdi_writeback *wb);
-- 
cgit v1.2.3


From f70ad448741580bf61cdfbeb02229c581409760a Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 28 Jun 2021 17:26:00 -0400
Subject: mm/memcg: Add folio_memcg_lock() and folio_memcg_unlock()

These are the folio equivalents of lock_page_memcg() and
unlock_page_memcg().

lock_page_memcg() and unlock_page_memcg() have too many callers to be
easily replaced in a single patch, so reimplement them as wrappers for
now to be cleaned up later when enough callers have been converted to
use folios.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/memcontrol.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 1c2776c3a223..be85450f066f 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -978,6 +978,8 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg);
 extern bool cgroup_memory_noswap;
 #endif
 
+void folio_memcg_lock(struct folio *folio);
+void folio_memcg_unlock(struct folio *folio);
 void lock_page_memcg(struct page *page);
 void unlock_page_memcg(struct page *page);
 
@@ -1397,6 +1399,14 @@ static inline void unlock_page_memcg(struct page *page)
 {
 }
 
+static inline void folio_memcg_lock(struct folio *folio)
+{
+}
+
+static inline void folio_memcg_unlock(struct folio *folio)
+{
+}
+
 static inline void mem_cgroup_handle_over_high(void)
 {
 }
-- 
cgit v1.2.3


From b1baabd995ab8e830dbf647fe731b51e12b8cedd Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 28 Jun 2021 20:00:28 -0400
Subject: mm/memcg: Add folio_lruvec()

This replaces mem_cgroup_page_lruvec().  All callers converted.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/memcontrol.h | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index be85450f066f..35577caf27d9 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -784,18 +784,17 @@ out:
 }
 
 /**
- * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
- * @page: the page
+ * folio_lruvec - return lruvec for isolating/putting an LRU folio
+ * @folio: Pointer to the folio.
  *
- * This function relies on page->mem_cgroup being stable.
+ * This function relies on folio->mem_cgroup being stable.
  */
-static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page)
+static inline struct lruvec *folio_lruvec(struct folio *folio)
 {
-	pg_data_t *pgdat = page_pgdat(page);
-	struct mem_cgroup *memcg = page_memcg(page);
+	struct mem_cgroup *memcg = folio_memcg(folio);
 
-	VM_WARN_ON_ONCE_PAGE(!memcg && !mem_cgroup_disabled(), page);
-	return mem_cgroup_lruvec(memcg, pgdat);
+	VM_WARN_ON_ONCE_FOLIO(!memcg && !mem_cgroup_disabled(), folio);
+	return mem_cgroup_lruvec(memcg, folio_pgdat(folio));
 }
 
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
@@ -1256,10 +1255,9 @@ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
 	return &pgdat->__lruvec;
 }
 
-static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page)
+static inline struct lruvec *folio_lruvec(struct folio *folio)
 {
-	pg_data_t *pgdat = page_pgdat(page);
-
+	struct pglist_data *pgdat = folio_pgdat(folio);
 	return &pgdat->__lruvec;
 }
 
-- 
cgit v1.2.3


From e809c3fedeeb806993349e7bf797b4c2b728be7d Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 28 Jun 2021 21:59:47 -0400
Subject: mm/memcg: Add folio_lruvec_lock() and similar functions

These are the folio equivalents of lock_page_lruvec() and similar
functions.  Also convert lruvec_memcg_debug() to take a folio.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/memcontrol.h | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 35577caf27d9..30d2cd7b5c9e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -801,15 +801,16 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
 
 struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
 
-struct lruvec *lock_page_lruvec(struct page *page);
-struct lruvec *lock_page_lruvec_irq(struct page *page);
-struct lruvec *lock_page_lruvec_irqsave(struct page *page,
+struct lruvec *folio_lruvec_lock(struct folio *folio);
+struct lruvec *folio_lruvec_lock_irq(struct folio *folio);
+struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
 						unsigned long *flags);
 
 #ifdef CONFIG_DEBUG_VM
-void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page);
+void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio);
 #else
-static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+static inline
+void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
 {
 }
 #endif
@@ -1261,7 +1262,8 @@ static inline struct lruvec *folio_lruvec(struct folio *folio)
 	return &pgdat->__lruvec;
 }
 
-static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+static inline
+void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
 {
 }
 
@@ -1291,26 +1293,26 @@ static inline void mem_cgroup_put(struct mem_cgroup *memcg)
 {
 }
 
-static inline struct lruvec *lock_page_lruvec(struct page *page)
+static inline struct lruvec *folio_lruvec_lock(struct folio *folio)
 {
-	struct pglist_data *pgdat = page_pgdat(page);
+	struct pglist_data *pgdat = folio_pgdat(folio);
 
 	spin_lock(&pgdat->__lruvec.lru_lock);
 	return &pgdat->__lruvec;
 }
 
-static inline struct lruvec *lock_page_lruvec_irq(struct page *page)
+static inline struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
 {
-	struct pglist_data *pgdat = page_pgdat(page);
+	struct pglist_data *pgdat = folio_pgdat(folio);
 
 	spin_lock_irq(&pgdat->__lruvec.lru_lock);
 	return &pgdat->__lruvec;
 }
 
-static inline struct lruvec *lock_page_lruvec_irqsave(struct page *page,
+static inline struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
 		unsigned long *flagsp)
 {
-	struct pglist_data *pgdat = page_pgdat(page);
+	struct pglist_data *pgdat = folio_pgdat(folio);
 
 	spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp);
 	return &pgdat->__lruvec;
@@ -1576,6 +1578,7 @@ static inline bool page_matches_lruvec(struct page *page, struct lruvec *lruvec)
 static inline struct lruvec *relock_page_lruvec_irq(struct page *page,
 		struct lruvec *locked_lruvec)
 {
+	struct folio *folio = page_folio(page);
 	if (locked_lruvec) {
 		if (page_matches_lruvec(page, locked_lruvec))
 			return locked_lruvec;
@@ -1583,13 +1586,14 @@ static inline struct lruvec *relock_page_lruvec_irq(struct page *page,
 		unlock_page_lruvec_irq(locked_lruvec);
 	}
 
-	return lock_page_lruvec_irq(page);
+	return folio_lruvec_lock_irq(folio);
 }
 
 /* Don't lock again iff page's lruvec locked */
 static inline struct lruvec *relock_page_lruvec_irqsave(struct page *page,
 		struct lruvec *locked_lruvec, unsigned long *flags)
 {
+	struct folio *folio = page_folio(page);
 	if (locked_lruvec) {
 		if (page_matches_lruvec(page, locked_lruvec))
 			return locked_lruvec;
@@ -1597,7 +1601,7 @@ static inline struct lruvec *relock_page_lruvec_irqsave(struct page *page,
 		unlock_page_lruvec_irqrestore(locked_lruvec, *flags);
 	}
 
-	return lock_page_lruvec_irqsave(page, flags);
+	return folio_lruvec_lock_irqsave(folio, flags);
 }
 
 #ifdef CONFIG_CGROUP_WRITEBACK
-- 
cgit v1.2.3


From 0de340cbed3359423e38ed49242ac9d6986b5cfd Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 29 Jun 2021 22:27:31 -0400
Subject: mm/memcg: Add folio_lruvec_relock_irq() and
 folio_lruvec_relock_irqsave()

These are the folio equivalents of relock_page_lruvec_irq() and
folio_lruvec_relock_irqsave().  Also convert page_matches_lruvec()
to folio_matches_lruvec().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/memcontrol.h | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 30d2cd7b5c9e..05094eaf1d61 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1568,19 +1568,19 @@ static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec,
 }
 
 /* Test requires a stable page->memcg binding, see page_memcg() */
-static inline bool page_matches_lruvec(struct page *page, struct lruvec *lruvec)
+static inline bool folio_matches_lruvec(struct folio *folio,
+		struct lruvec *lruvec)
 {
-	return lruvec_pgdat(lruvec) == page_pgdat(page) &&
-	       lruvec_memcg(lruvec) == page_memcg(page);
+	return lruvec_pgdat(lruvec) == folio_pgdat(folio) &&
+	       lruvec_memcg(lruvec) == folio_memcg(folio);
 }
 
 /* Don't lock again iff page's lruvec locked */
-static inline struct lruvec *relock_page_lruvec_irq(struct page *page,
+static inline struct lruvec *folio_lruvec_relock_irq(struct folio *folio,
 		struct lruvec *locked_lruvec)
 {
-	struct folio *folio = page_folio(page);
 	if (locked_lruvec) {
-		if (page_matches_lruvec(page, locked_lruvec))
+		if (folio_matches_lruvec(folio, locked_lruvec))
 			return locked_lruvec;
 
 		unlock_page_lruvec_irq(locked_lruvec);
@@ -1590,12 +1590,11 @@ static inline struct lruvec *relock_page_lruvec_irq(struct page *page,
 }
 
 /* Don't lock again iff page's lruvec locked */
-static inline struct lruvec *relock_page_lruvec_irqsave(struct page *page,
+static inline struct lruvec *folio_lruvec_relock_irqsave(struct folio *folio,
 		struct lruvec *locked_lruvec, unsigned long *flags)
 {
-	struct folio *folio = page_folio(page);
 	if (locked_lruvec) {
-		if (page_matches_lruvec(page, locked_lruvec))
+		if (folio_matches_lruvec(folio, locked_lruvec))
 			return locked_lruvec;
 
 		unlock_page_lruvec_irqrestore(locked_lruvec, *flags);
-- 
cgit v1.2.3


From c5ce619a77ce00d537ef512e7a823c99ce890a40 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 4 May 2021 17:19:13 -0400
Subject: mm/workingset: Convert workingset_activation to take a folio

This function already assumed it was being passed a head page.  No real
change here, except that thp_nr_pages() compiles away on kernels with
THP compiled out while folio_nr_pages() is always present.  Also convert
page_memcg_rcu() to folio_memcg_rcu().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/memcontrol.h | 22 ++++++++++++----------
 include/linux/swap.h       |  2 +-
 2 files changed, 13 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 05094eaf1d61..7bd78c13d1fa 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -457,20 +457,22 @@ static inline struct mem_cgroup *page_memcg(struct page *page)
 	return folio_memcg(page_folio(page));
 }
 
-/*
- * page_memcg_rcu - locklessly get the memory cgroup associated with a page
- * @page: a pointer to the page struct
+/**
+ * folio_memcg_rcu - Locklessly get the memory cgroup associated with a folio.
+ * @folio: Pointer to the folio.
  *
- * Returns a pointer to the memory cgroup associated with the page,
- * or NULL. This function assumes that the page is known to have a
+ * This function assumes that the folio is known to have a
  * proper memory cgroup pointer. It's not safe to call this function
- * against some type of pages, e.g. slab pages or ex-slab pages.
+ * against some type of folios, e.g. slab folios or ex-slab folios.
+ *
+ * Return: A pointer to the memory cgroup associated with the folio,
+ * or NULL.
  */
-static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
+static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
 {
-	unsigned long memcg_data = READ_ONCE(page->memcg_data);
+	unsigned long memcg_data = READ_ONCE(folio->memcg_data);
 
-	VM_BUG_ON_PAGE(PageSlab(page), page);
+	VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
 	WARN_ON_ONCE(!rcu_read_lock_held());
 
 	if (memcg_data & MEMCG_DATA_KMEM) {
@@ -1158,7 +1160,7 @@ static inline struct mem_cgroup *page_memcg(struct page *page)
 	return NULL;
 }
 
-static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
+static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
 {
 	WARN_ON_ONCE(!rcu_read_lock_held());
 	return NULL;
diff --git a/include/linux/swap.h b/include/linux/swap.h
index c7ecd3ad8e2e..0fc84797623f 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -330,7 +330,7 @@ static inline swp_entry_t folio_swap_entry(struct folio *folio)
 void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages);
 void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg);
 void workingset_refault(struct page *page, void *shadow);
-void workingset_activation(struct page *page);
+void workingset_activation(struct folio *folio);
 
 /* Only track the nodes of mappings with shadow entries */
 void workingset_update_node(struct xa_node *node);
-- 
cgit v1.2.3


From bf6bd276b374d44f6e7146d52aa6097eb91384a3 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 7 May 2021 10:55:27 -0400
Subject: mm: Add folio_pfn()

This is the folio equivalent of page_to_pfn().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/mm.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 04e41d4d85ea..47143f3e7f0a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1624,6 +1624,20 @@ static inline unsigned long page_to_section(const struct page *page)
 }
 #endif
 
+/**
+ * folio_pfn - Return the Page Frame Number of a folio.
+ * @folio: The folio.
+ *
+ * A folio may contain multiple pages.  The pages have consecutive
+ * Page Frame Numbers.
+ *
+ * Return: The Page Frame Number of the first page in the folio.
+ */
+static inline unsigned long folio_pfn(struct folio *folio)
+{
+	return page_to_pfn(&folio->page);
+}
+
 /* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */
 #ifdef CONFIG_MIGRATION
 static inline bool is_pinnable_page(struct page *page)
-- 
cgit v1.2.3


From 2a5a8fa8b23144d14567d6f8293dd6fbeecee393 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 15 Sep 2021 18:16:01 +0200
Subject: leds: trigger: use RCU to protect the led_cdevs list

Even with the previous commit 27af8e2c90fb
("leds: trigger: fix potential deadlock with libata")
to this file, we still get lockdep unhappy, and Boqun
explained the report here:
https://lore.kernel.org/r/YNA+d1X4UkoQ7g8a@boqun-archlinux

Effectively, this means that the read_lock_irqsave() isn't
enough here because another CPU might be trying to do a
write lock, and thus block the readers.

This is all pretty messy, but it doesn't seem right that
the LEDs framework imposes some locking requirements on
users, in particular we'd have to make the spinlock in the
iwlwifi driver always disable IRQs, even if we don't need
that for any other reason, just to avoid this deadlock.

Since writes to the led_cdevs list are rare (and are done
by userspace), just switch the list to RCU. This costs a
synchronize_rcu() at removal time so we can ensure things
are correct, but that seems like a small price to pay for
getting lock-free iterations and no deadlocks (nor any
locking requirements imposed on users.)

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Pavel Machek <pavel@ucw.cz>
---
 include/linux/leds.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/leds.h b/include/linux/leds.h
index a0b730be40ad..ba4861ec73d3 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -360,7 +360,7 @@ struct led_trigger {
 	struct led_hw_trigger_type *trigger_type;
 
 	/* LEDs under control by this trigger (for simple triggers) */
-	rwlock_t	  leddev_list_lock;
+	spinlock_t	  leddev_list_lock;
 	struct list_head  led_cdevs;
 
 	/* Link to next registered trigger */
-- 
cgit v1.2.3


From 4795448117824cc4900d696f17d1516c56371045 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Sat, 18 Sep 2021 14:53:44 +0200
Subject: PCI: ACPI: Drop acpi_pci_bus

The acpi_pci_bus structure was used primarily for running
acpi_pci_find_companion() during PCI device objects registration,
but after commit 375553a93201 ("PCI: Setup ACPI fwnode early and at
the same time with OF") that function is called by pci_setup_device()
via pci_set_acpi_fwnode(), which happens before calling
pci_device_add() on the new PCI device object, so its ACPI companion
has been set already when acpi_device_notify() runs and it will never
call ->find_companion() from acpi_pci_bus.

For this reason, modify acpi_device_notify() and
acpi_device_notify_remove() to call pci_acpi_setup() and
pci_acpi_cleanup(), respectively, directly on PCI device objects
and drop acpi_pci_bus altogether.

While at it, notice that pci_acpi_setup() and pci_acpi_cleanup()
can obtain the ACPI companion pointer, which is guaranteed to not
be NULL, from their callers and modify them to work that way so
as to reduce the number of redundant checks somewhat.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Tested-by: Ferry Toth <fntoth@gmail.com>
---
 include/linux/pci-acpi.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h
index f16de399d2de..078225b514d4 100644
--- a/include/linux/pci-acpi.h
+++ b/include/linux/pci-acpi.h
@@ -84,6 +84,14 @@ extern struct pci_bus *acpi_pci_root_create(struct acpi_pci_root *root,
 void acpi_pci_add_bus(struct pci_bus *bus);
 void acpi_pci_remove_bus(struct pci_bus *bus);
 
+#ifdef CONFIG_PCI
+void pci_acpi_setup(struct device *dev, struct acpi_device *adev);
+void pci_acpi_cleanup(struct device *dev, struct acpi_device *adev);
+#else
+static inline void pci_acpi_setup(struct device *dev, struct acpi_device *adev) {}
+static inline void pci_acpi_cleanup(struct device *dev, struct acpi_device *adev) {}
+#endif
+
 #ifdef	CONFIG_ACPI_PCI_SLOT
 void acpi_pci_slot_init(void);
 void acpi_pci_slot_enumerate(struct pci_bus *bus);
-- 
cgit v1.2.3


From e765f13ed126fe7e41d1a6e3c60d754cd6c2af93 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 22 Sep 2021 19:34:30 +0200
Subject: nvdimm/pmem: move dax_attribute_group from dax to pmem

dax_attribute_group is only used by the pmem driver, and can avoid the
completely pointless lookup by the disk name if moved there.  This
leaves just a single caller of dax_get_by_host, so move dax_get_by_host
into the same ifdef block as that caller.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Link: https://lore.kernel.org/r/20210922173431.2454024-3-hch@lst.de
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/dax.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dax.h b/include/linux/dax.h
index 2619d94c308d..8623caa67388 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -38,8 +38,6 @@ struct dax_operations {
 	int (*zero_page_range)(struct dax_device *, pgoff_t, size_t);
 };
 
-extern struct attribute_group dax_attribute_group;
-
 #if IS_ENABLED(CONFIG_DAX)
 struct dax_device *alloc_dax(void *private, const char *host,
 		const struct dax_operations *ops, unsigned long flags);
-- 
cgit v1.2.3


From 7b4d7894c65bd1e83b2f021d7944a46bdd64836a Mon Sep 17 00:00:00 2001
From: Deepak Kumar Singh <deesin@codeaurora.org>
Date: Tue, 31 Aug 2021 20:00:27 +0530
Subject: soc: qcom: aoss: Expose send for generic usecase

Not all upcoming usecases will have an interface to allow the aoss
driver to hook onto. Expose the send api and create a get function to
enable drivers to send their own messages to aoss.

Signed-off-by: Chris Lew <clew@codeaurora.org>
Signed-off-by: Deepak Kumar Singh <deesin@codeaurora.org>
Reviewed-by: Stephen Boyd <swboyd@chromium.org>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Link: https://lore.kernel.org/r/1630420228-31075-2-git-send-email-deesin@codeaurora.org
---
 include/linux/soc/qcom/qcom_aoss.h | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 include/linux/soc/qcom/qcom_aoss.h

(limited to 'include/linux')

diff --git a/include/linux/soc/qcom/qcom_aoss.h b/include/linux/soc/qcom/qcom_aoss.h
new file mode 100644
index 000000000000..3c2a82e606f8
--- /dev/null
+++ b/include/linux/soc/qcom/qcom_aoss.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2021, The Linux Foundation. All rights reserved.
+ */
+
+#ifndef __QCOM_AOSS_H__
+#define __QCOM_AOSS_H__
+
+#include <linux/err.h>
+#include <linux/device.h>
+
+struct qmp;
+
+#if IS_ENABLED(CONFIG_QCOM_AOSS_QMP)
+
+int qmp_send(struct qmp *qmp, const void *data, size_t len);
+struct qmp *qmp_get(struct device *dev);
+void qmp_put(struct qmp *qmp);
+
+#else
+
+static inline int qmp_send(struct qmp *qmp, const void *data, size_t len)
+{
+	return -ENODEV;
+}
+
+static inline struct qmp *qmp_get(struct device *dev)
+{
+	return ERR_PTR(-ENODEV);
+}
+
+static inline void qmp_put(struct qmp *qmp)
+{
+}
+
+#endif
+
+#endif
-- 
cgit v1.2.3


From 99139b80c1b3d73026ed8be2de42c52e2976ab64 Mon Sep 17 00:00:00 2001
From: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Date: Mon, 27 Sep 2021 14:55:40 +0100
Subject: soc: qcom: apr: make code more reuseable

APR and other packet routers like GPR are pretty much same and
interact with other drivers in similar way.

Ex: GPR ports can be considered as APR services, only difference
is they are allocated dynamically.

Other difference is packet layout, which should not matter
with the apis abstracted. Apart from this the rest of the
functionality is pretty much identical across APR and GPR.

Make the apr code more reusable by abstracting it service level,
rather than device level so that we do not need to write
new drivers for other new packet routers like GPR.

This patch is in preparation to add GPR support to this driver.

Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Link: https://lore.kernel.org/r/20210927135559.738-4-srinivas.kandagatla@linaro.org
---
 include/linux/soc/qcom/apr.h | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/soc/qcom/apr.h b/include/linux/soc/qcom/apr.h
index 137f9f2ac4c3..7bca213a3f83 100644
--- a/include/linux/soc/qcom/apr.h
+++ b/include/linux/soc/qcom/apr.h
@@ -79,6 +79,15 @@ struct apr_resp_pkt {
 #define APR_SVC_MAJOR_VERSION(v)	((v >> 16) & 0xFF)
 #define APR_SVC_MINOR_VERSION(v)	(v & 0xFF)
 
+struct packet_router;
+struct pkt_router_svc {
+	struct device *dev;
+	struct packet_router *pr;
+	spinlock_t lock;
+	int id;
+	void *priv;
+};
+
 struct apr_device {
 	struct device	dev;
 	uint16_t	svc_id;
@@ -86,11 +95,12 @@ struct apr_device {
 	uint32_t	version;
 	char name[APR_NAME_SIZE];
 	const char *service_path;
-	spinlock_t	lock;
+	struct pkt_router_svc svc;
 	struct list_head node;
 };
 
 #define to_apr_device(d) container_of(d, struct apr_device, dev)
+#define svc_to_apr_device(d) container_of(d, struct apr_device, svc)
 
 struct apr_driver {
 	int	(*probe)(struct apr_device *sl);
-- 
cgit v1.2.3


From ec1471a898cca38af6b8956a83ebc1297214546f Mon Sep 17 00:00:00 2001
From: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Date: Mon, 27 Sep 2021 14:55:42 +0100
Subject: soc: qcom: apr: Add GPR support

Qualcomm Generic Packet router aka GPR is the IPC mechanism found
in AudioReach next generation signal processing framework to perform
command and response messages between various processors.

GPR has concepts of static and dynamic port, all static services like
APM (Audio Processing Manager), PRM (Proxy resource manager) have
fixed port numbers where as dynamic services like graphs have dynamic
port numbers which are allocated at runtime. All GPR packet messages
will have source and destination domain and port along with opcode
and payload.

Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Link: https://lore.kernel.org/r/20210927135559.738-6-srinivas.kandagatla@linaro.org
---
 include/linux/soc/qcom/apr.h | 58 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/soc/qcom/apr.h b/include/linux/soc/qcom/apr.h
index 7bca213a3f83..23c5b30f3511 100644
--- a/include/linux/soc/qcom/apr.h
+++ b/include/linux/soc/qcom/apr.h
@@ -7,6 +7,7 @@
 #include <linux/device.h>
 #include <linux/mod_devicetable.h>
 #include <dt-bindings/soc/qcom,apr.h>
+#include <dt-bindings/soc/qcom,gpr.h>
 
 extern struct bus_type aprbus;
 
@@ -75,19 +76,65 @@ struct apr_resp_pkt {
 	int payload_size;
 };
 
+struct gpr_hdr {
+	uint32_t version:4;
+	uint32_t hdr_size:4;
+	uint32_t pkt_size:24;
+	uint32_t dest_domain:8;
+	uint32_t src_domain:8;
+	uint32_t reserved:16;
+	uint32_t src_port;
+	uint32_t dest_port;
+	uint32_t token;
+	uint32_t opcode;
+} __packed;
+
+struct gpr_pkt {
+	struct gpr_hdr hdr;
+	uint32_t payload[];
+};
+
+struct gpr_resp_pkt {
+	struct gpr_hdr hdr;
+	void *payload;
+	int payload_size;
+};
+
+#define GPR_HDR_SIZE			sizeof(struct gpr_hdr)
+#define GPR_PKT_VER			0x0
+#define GPR_PKT_HEADER_WORD_SIZE	((sizeof(struct gpr_pkt) + 3) >> 2)
+#define GPR_PKT_HEADER_BYTE_SIZE	(GPR_PKT_HEADER_WORD_SIZE << 2)
+
+#define GPR_BASIC_RSP_RESULT		0x02001005
+
+struct gpr_ibasic_rsp_result_t {
+	uint32_t opcode;
+	uint32_t status;
+};
+
+#define GPR_BASIC_EVT_ACCEPTED		0x02001006
+
+struct gpr_ibasic_rsp_accepted_t {
+	uint32_t opcode;
+};
+
 /* Bits 0 to 15 -- Minor version,  Bits 16 to 31 -- Major version */
 #define APR_SVC_MAJOR_VERSION(v)	((v >> 16) & 0xFF)
 #define APR_SVC_MINOR_VERSION(v)	(v & 0xFF)
 
+typedef int (*gpr_port_cb) (struct gpr_resp_pkt *d, void *priv, int op);
 struct packet_router;
 struct pkt_router_svc {
 	struct device *dev;
+	gpr_port_cb callback;
 	struct packet_router *pr;
 	spinlock_t lock;
 	int id;
 	void *priv;
 };
 
+typedef struct pkt_router_svc gpr_port_t;
+
 struct apr_device {
 	struct device	dev;
 	uint16_t	svc_id;
@@ -99,6 +146,8 @@ struct apr_device {
 	struct list_head node;
 };
 
+typedef struct apr_device gpr_device_t;
+
 #define to_apr_device(d) container_of(d, struct apr_device, dev)
 #define svc_to_apr_device(d) container_of(d, struct apr_device, svc)
 
@@ -107,10 +156,12 @@ struct apr_driver {
 	int	(*remove)(struct apr_device *sl);
 	int	(*callback)(struct apr_device *a,
 			    struct apr_resp_pkt *d);
+	int	(*gpr_callback)(struct gpr_resp_pkt *d, void *data, int op);
 	struct device_driver		driver;
 	const struct apr_device_id	*id_table;
 };
 
+typedef struct apr_driver gpr_driver_t;
 #define to_apr_driver(d) container_of(d, struct apr_driver, driver)
 
 /*
@@ -133,7 +184,14 @@ void apr_driver_unregister(struct apr_driver *drv);
 #define module_apr_driver(__apr_driver) \
 	module_driver(__apr_driver, apr_driver_register, \
 			apr_driver_unregister)
+#define module_gpr_driver(__gpr_driver) module_apr_driver(__gpr_driver)
 
 int apr_send_pkt(struct apr_device *adev, struct apr_pkt *pkt);
 
+gpr_port_t *gpr_alloc_port(gpr_device_t *gdev, struct device *dev,
+				gpr_port_cb cb, void *priv);
+void gpr_free_port(gpr_port_t *port);
+int gpr_send_port_pkt(gpr_port_t *port, struct gpr_pkt *pkt);
+int gpr_send_pkt(gpr_device_t *gdev, struct gpr_pkt *pkt);
+
 #endif /* __QCOM_APR_H_ */
-- 
cgit v1.2.3


From af3826db74d184bc9c2c9d3ff34548e5f317a6f3 Mon Sep 17 00:00:00 2001
From: Geetha sowjanya <gakula@marvell.com>
Date: Tue, 28 Sep 2021 11:25:26 +0530
Subject: octeontx2-pf: Use hardware register for CQE count

Current driver uses software CQ head pointer to poll on CQE
header in memory to determine if CQE is valid. Software needs
to make sure, that the reads of the CQE do not get re-ordered
so much that it ends up with an inconsistent view of the CQE.
To ensure that DMB barrier after read to first CQE cacheline
and before reading of the rest of the CQE is needed.
But having barrier for every CQE read will impact the performance,
instead use hardware CQ head and tail pointers to find the
valid number of CQEs.

Signed-off-by: Geetha sowjanya <gakula@marvell.com>
Signed-off-by: Sunil Kovvuri Goutham <sgoutham@marvell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/soc/marvell/octeontx2/asm.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/soc/marvell/octeontx2/asm.h b/include/linux/soc/marvell/octeontx2/asm.h
index fa1d6af0164e..0f79fd7f81a1 100644
--- a/include/linux/soc/marvell/octeontx2/asm.h
+++ b/include/linux/soc/marvell/octeontx2/asm.h
@@ -34,9 +34,23 @@
 			 : [rf] "+r"(val)		\
 			 : [rs] "r"(addr));		\
 })
+
+static inline u64 otx2_atomic64_fetch_add(u64 incr, u64 *ptr)
+{
+	u64 result;
+
+	asm volatile (".cpu  generic+lse\n"
+		      "ldadda %x[i], %x[r], [%[b]]"
+		      : [r] "=r" (result), "+m" (*ptr)
+		      : [i] "r" (incr), [b] "r" (ptr)
+		      : "memory");
+	return result;
+}
+
 #else
 #define otx2_lmt_flush(ioaddr)          ({ 0; })
 #define cn10k_lmt_flush(val, addr)	({ addr = val; })
+#define otx2_atomic64_fetch_add(incr, ptr)	({ incr; })
 #endif
 
 #endif /* __SOC_OTX2_ASM_H */
-- 
cgit v1.2.3


From 62b8963cd84df1fc04986cd9b27586acce758f36 Mon Sep 17 00:00:00 2001
From: Pradeep Kumar Chitrapu <pradeepc@codeaurora.org>
Date: Tue, 28 Sep 2021 14:00:45 +0300
Subject: ieee80211: Add new A-MPDU factor macro for HE 6 GHz peer caps

Add IEEE80211_HE_6GHZ_MAX_AMPDU_FACTOR as per IEEE Std 802.11ax-2021,
9.4.2.263 to use for peer max A-MPDU factor in 6 GHz band.

Signed-off-by: Pradeep Kumar Chitrapu <pradeepc@codeaurora.org>
Signed-off-by: Jouni Malinen <jouni@codeaurora.org>
Acked-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
Link: https://lore.kernel.org/r/20210913175510.193005-1-jouni@codeaurora.org
---
 include/linux/ieee80211.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 694264503119..a1a7eda35cb5 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -2084,6 +2084,7 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap,
 
 #define IEEE80211_HE_VHT_MAX_AMPDU_FACTOR	20
 #define IEEE80211_HE_HT_MAX_AMPDU_FACTOR	16
+#define IEEE80211_HE_6GHZ_MAX_AMPDU_FACTOR	13
 
 /* 802.11ax HE PHY capabilities */
 #define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G		0x02
-- 
cgit v1.2.3


From 8de1e9b01b03d2805f58aa490ea5bf0df003bc80 Mon Sep 17 00:00:00 2001
From: Meir Lichtinger <meirl@nvidia.com>
Date: Wed, 22 Sep 2021 11:28:50 +0300
Subject: net/mlx5: Add uid field to UAR allocation structures

Add uid field to mlx5_ifc_alloc_uar_in_bits and
mlx5_ifc_dealloc_uar_out_bits structs.

This field will be used by FW to manage UAR according to uid.

Signed-off-by: Meir Lichtinger <meirl@nvidia.com>
Reviewed-by: Yishai Hadas <yishaih@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index f3638d09ba77..96f5fb2af811 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -7569,7 +7569,7 @@ struct mlx5_ifc_dealloc_uar_out_bits {
 
 struct mlx5_ifc_dealloc_uar_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -8416,7 +8416,7 @@ struct mlx5_ifc_alloc_uar_out_bits {
 
 struct mlx5_ifc_alloc_uar_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
-- 
cgit v1.2.3


From d2c8a1554c10d5e0443b1f97f480d7dacd55cf55 Mon Sep 17 00:00:00 2001
From: Meir Lichtinger <meirl@nvidia.com>
Date: Wed, 22 Sep 2021 11:28:51 +0300
Subject: IB/mlx5: Enable UAR to have DevX UID

UID field was added to alloc_uar and dealloc_uar PRM command, to specify
DevX UID for UAR. This change enables firmware validating user access to
its own UAR resources.

For the kernel allocated UARs the UID will stay 0 as of today.

Signed-off-by: Meir Lichtinger <meirl@nvidia.com>
Reviewed-by: Yishai Hadas <yishaih@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 include/linux/mlx5/driver.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index e23417424373..1b8bae246b28 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1005,8 +1005,6 @@ void mlx5_cmd_mbox_status(void *out, u8 *status, u32 *syndrome);
 bool mlx5_cmd_is_down(struct mlx5_core_dev *dev);
 
 int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type);
-int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn);
-int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn);
 void mlx5_health_flush(struct mlx5_core_dev *dev);
 void mlx5_health_cleanup(struct mlx5_core_dev *dev);
 int mlx5_health_init(struct mlx5_core_dev *dev);
-- 
cgit v1.2.3


From af9d82626c8f8547910e3e22c76ced3f5d5f5286 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <jroedel@suse.de>
Date: Tue, 24 Aug 2021 14:20:51 +0200
Subject: PCI/ACPI: Remove OSC_PCI_SUPPORT_MASKS and OSC_PCI_CONTROL_MASKS

These masks are only used internally in the PCI Host Bridge _OSC
negotiation code, which already makes sure nothing outside of these masks
is set. Remove the masks and simplify the code.

Suggested-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://lore.kernel.org/r/20210824122054.29481-2-joro@8bytes.org
Signed-off-by: Joerg Roedel <jroedel@suse.de>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Rafael J. Wysocki <rafael@kernel.org>
---
 include/linux/acpi.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 974d497a897d..eb59fd3052c0 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -577,7 +577,6 @@ extern u32 osc_sb_native_usb4_control;
 #define OSC_PCI_MSI_SUPPORT			0x00000010
 #define OSC_PCI_EDR_SUPPORT			0x00000080
 #define OSC_PCI_HPX_TYPE_3_SUPPORT		0x00000100
-#define OSC_PCI_SUPPORT_MASKS			0x0000019f
 
 /* PCI Host Bridge _OSC: Capabilities DWORD 3: Control Field */
 #define OSC_PCI_EXPRESS_NATIVE_HP_CONTROL	0x00000001
@@ -587,7 +586,6 @@ extern u32 osc_sb_native_usb4_control;
 #define OSC_PCI_EXPRESS_CAPABILITY_CONTROL	0x00000010
 #define OSC_PCI_EXPRESS_LTR_CONTROL		0x00000020
 #define OSC_PCI_EXPRESS_DPC_CONTROL		0x00000080
-#define OSC_PCI_CONTROL_MASKS			0x000000bf
 
 #define ACPI_GSB_ACCESS_ATTRIB_QUICK		0x00000002
 #define ACPI_GSB_ACCESS_ATTRIB_SEND_RCV         0x00000004
-- 
cgit v1.2.3


From fec2432c9a7370788faab416b38589ac9f4350e5 Mon Sep 17 00:00:00 2001
From: Diana Craciun <diana.craciun@oss.nxp.com>
Date: Wed, 22 Sep 2021 14:05:29 +0300
Subject: bus/fsl-mc: Add generic implementation for open/reset/close commands

The open/reset/close commands format is similar for all objects.
Currently there are multiple implementations for these commands
scattered through various drivers. The code is cavsi-identical.
Create a generic implementation for the open/reset/close commands.
One of the consumer will be the VFIO driver which needs to
be able to reset a device.

Signed-off-by: Diana Craciun <diana.craciun@oss.nxp.com>
Reviewed-by: Laurentiu Tudor <laurentiu.tudor@nxp.com>
Link: https://lore.kernel.org/r/20210922110530.24736-1-diana.craciun@oss.nxp.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 include/linux/fsl/mc.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fsl/mc.h b/include/linux/fsl/mc.h
index 30ece3ae6df7..e026f6c48b49 100644
--- a/include/linux/fsl/mc.h
+++ b/include/linux/fsl/mc.h
@@ -620,6 +620,20 @@ int dpcon_reset(struct fsl_mc_io *mc_io,
 		u32 cmd_flags,
 		u16 token);
 
+int fsl_mc_obj_open(struct fsl_mc_io *mc_io,
+		    u32 cmd_flags,
+		    int obj_id,
+		    char *obj_type,
+		    u16 *token);
+
+int fsl_mc_obj_close(struct fsl_mc_io *mc_io,
+		     u32 cmd_flags,
+		     u16 token);
+
+int fsl_mc_obj_reset(struct fsl_mc_io *mc_io,
+		     u32 cmd_flags,
+		     u16 token);
+
 /**
  * struct dpcon_attr - Structure representing DPCON attributes
  * @id: DPCON object ID
-- 
cgit v1.2.3


From 3d717fad5081b8e3bda76d86907fad95398cbde8 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 28 Sep 2021 16:09:45 -0700
Subject: bpf: Replace "want address" users of BPF_CAST_CALL with BPF_CALL_IMM

In order to keep ahead of cases in the kernel where Control Flow
Integrity (CFI) may trip over function call casts, enabling
-Wcast-function-type is helpful. To that end, BPF_CAST_CALL causes
various warnings and is one of the last places in the kernel triggering
this warning.

Most places using BPF_CAST_CALL actually just want a void * to perform
math on. It's not actually performing a call, so just use a different
helper to get the void *, by way of the new BPF_CALL_IMM() helper, which
can clean up a common copy/paste idiom as well.

This change results in no object code difference.

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://github.com/KSPP/linux/issues/20
Link: https://lore.kernel.org/lkml/CAEf4Bzb46=-J5Fxc3mMZ8JQPtK1uoE0q6+g6WPz53Cvx=CBEhw@mail.gmail.com
Link: https://lore.kernel.org/bpf/20210928230946.4062144-2-keescook@chromium.org
---
 include/linux/filter.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 4a93c12543ee..6c247663d4ce 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -365,13 +365,17 @@ static inline bool insn_is_zext(const struct bpf_insn *insn)
 #define BPF_CAST_CALL(x)					\
 		((u64 (*)(u64, u64, u64, u64, u64))(x))
 
+/* Convert function address to BPF immediate */
+
+#define BPF_CALL_IMM(x)	((void *)(x) - (void *)__bpf_call_base)
+
 #define BPF_EMIT_CALL(FUNC)					\
 	((struct bpf_insn) {					\
 		.code  = BPF_JMP | BPF_CALL,			\
 		.dst_reg = 0,					\
 		.src_reg = 0,					\
 		.off   = 0,					\
-		.imm   = ((FUNC) - __bpf_call_base) })
+		.imm   = BPF_CALL_IMM(FUNC) })
 
 /* Raw code statement block */
 
-- 
cgit v1.2.3


From 102acbacfd9a96d101abd96d1a7a5bf92b7c3e8e Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 28 Sep 2021 16:09:46 -0700
Subject: bpf: Replace callers of BPF_CAST_CALL with proper function typedef

In order to keep ahead of cases in the kernel where Control Flow
Integrity (CFI) may trip over function call casts, enabling
-Wcast-function-type is helpful. To that end, BPF_CAST_CALL causes
various warnings and is one of the last places in the kernel
triggering this warning.

For actual function calls, replace BPF_CAST_CALL() with a typedef, which
captures the same details about the given function pointers.

This change results in no object code difference.

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Link: https://github.com/KSPP/linux/issues/20
Link: https://lore.kernel.org/lkml/CAEf4Bzb46=-J5Fxc3mMZ8JQPtK1uoE0q6+g6WPz53Cvx=CBEhw@mail.gmail.com
Link: https://lore.kernel.org/bpf/20210928230946.4062144-3-keescook@chromium.org
---
 include/linux/bpf.h    | 4 +++-
 include/linux/filter.h | 5 -----
 2 files changed, 3 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b6c45a6cbbba..19735d59230a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -48,6 +48,7 @@ extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
 extern struct kobject *btf_kobj;
 
+typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64);
 typedef int (*bpf_iter_init_seq_priv_t)(void *private_data,
 					struct bpf_iter_aux_info *aux);
 typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data);
@@ -142,7 +143,8 @@ struct bpf_map_ops {
 	int (*map_set_for_each_callback_args)(struct bpf_verifier_env *env,
 					      struct bpf_func_state *caller,
 					      struct bpf_func_state *callee);
-	int (*map_for_each_callback)(struct bpf_map *map, void *callback_fn,
+	int (*map_for_each_callback)(struct bpf_map *map,
+				     bpf_callback_t callback_fn,
 				     void *callback_ctx, u64 flags);
 
 	/* BTF name and id of struct allocated by map_alloc */
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 6c247663d4ce..47f80adbe744 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -360,11 +360,6 @@ static inline bool insn_is_zext(const struct bpf_insn *insn)
 		.off   = 0,					\
 		.imm   = TGT })
 
-/* Function call */
-
-#define BPF_CAST_CALL(x)					\
-		((u64 (*)(u64, u64, u64, u64, u64))(x))
-
 /* Convert function address to BPF immediate */
 
 #define BPF_CALL_IMM(x)	((void *)(x) - (void *)__bpf_call_base)
-- 
cgit v1.2.3


From 7c2dcfa295b149a58010632c7eb7e73bd0626a7a Mon Sep 17 00:00:00 2001
From: Horatiu Vultur <horatiu.vultur@microchip.com>
Date: Tue, 28 Sep 2021 20:45:19 +0200
Subject: net: phy: micrel: Add support for LAN8804 PHY

The LAN8804 PHY has same features as that of LAN8814 PHY except that it
doesn't support 1588, SyncE or Q-USGMII.

This PHY is found inside the LAN966X switches.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/micrel_phy.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/micrel_phy.h b/include/linux/micrel_phy.h
index 3d43c60b49fa..1f7c33b2f5a3 100644
--- a/include/linux/micrel_phy.h
+++ b/include/linux/micrel_phy.h
@@ -28,6 +28,7 @@
 #define PHY_ID_KSZ9031		0x00221620
 #define PHY_ID_KSZ9131		0x00221640
 #define PHY_ID_LAN8814		0x00221660
+#define PHY_ID_LAN8804		0x00221670
 
 #define PHY_ID_KSZ886X		0x00221430
 #define PHY_ID_KSZ8863		0x00221435
-- 
cgit v1.2.3


From e81e99bacc9f9347bda7808a949c1ce9fcc2bbf4 Mon Sep 17 00:00:00 2001
From: David Stevens <stevensd@chromium.org>
Date: Wed, 29 Sep 2021 11:32:59 +0900
Subject: swiotlb: Support aligned swiotlb buffers

Add an argument to swiotlb_tbl_map_single that specifies the desired
alignment of the allocated buffer. This is used by dma-iommu to ensure
the buffer is aligned to the iova granule size when using swiotlb with
untrusted sub-granule mappings. This addresses an issue where adjacent
slots could be exposed to the untrusted device if IO_TLB_SIZE < iova
granule < PAGE_SIZE.

Signed-off-by: David Stevens <stevensd@chromium.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20210929023300.335969-7-stevensd@google.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/swiotlb.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index b0cb2a9973f4..569272871375 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -45,7 +45,8 @@ extern void __init swiotlb_update_mem_attributes(void);
 
 phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys,
 		size_t mapping_size, size_t alloc_size,
-		enum dma_data_direction dir, unsigned long attrs);
+		unsigned int alloc_aligned_mask, enum dma_data_direction dir,
+		unsigned long attrs);
 
 extern void swiotlb_tbl_unmap_single(struct device *hwdev,
 				     phys_addr_t tlb_addr,
-- 
cgit v1.2.3


From e5af50a5df571c1d0268b02f924de49b742c990f Mon Sep 17 00:00:00 2001
From: Peter Collingbourne <pcc@google.com>
Date: Thu, 23 Sep 2021 18:06:55 -0700
Subject: arm64: kasan: mte: move GCR_EL1 switch to task switch when KASAN
 disabled

It is not necessary to write to GCR_EL1 on every kernel entry and
exit when HW tag-based KASAN is disabled because the kernel will not
execute any IRG instructions in that mode. Since accessing GCR_EL1
can be expensive on some microarchitectures, avoid doing so by moving
the access to task switch when HW tag-based KASAN is disabled.

Signed-off-by: Peter Collingbourne <pcc@google.com>
Acked-by: Andrey Konovalov <andreyknvl@gmail.com>
Link: https://linux-review.googlesource.com/id/I78e90d60612a94c24344526f476ac4ff216e10d2
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Link: https://lore.kernel.org/r/20210924010655.2886918-1-pcc@google.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/kasan.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index dd874a1ee862..de5f5913374d 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -89,7 +89,7 @@ static __always_inline bool kasan_enabled(void)
 	return static_branch_likely(&kasan_flag_enabled);
 }
 
-static inline bool kasan_has_integrated_init(void)
+static inline bool kasan_hw_tags_enabled(void)
 {
 	return kasan_enabled();
 }
@@ -104,7 +104,7 @@ static inline bool kasan_enabled(void)
 	return IS_ENABLED(CONFIG_KASAN);
 }
 
-static inline bool kasan_has_integrated_init(void)
+static inline bool kasan_hw_tags_enabled(void)
 {
 	return false;
 }
@@ -125,6 +125,11 @@ static __always_inline void kasan_free_pages(struct page *page,
 
 #endif /* CONFIG_KASAN_HW_TAGS */
 
+static inline bool kasan_has_integrated_init(void)
+{
+	return kasan_hw_tags_enabled();
+}
+
 #ifdef CONFIG_KASAN
 
 struct kasan_cache {
-- 
cgit v1.2.3


From 7101c83950e629b83f9d827f288063e52074a6ea Mon Sep 17 00:00:00 2001
From: Prashant Malani <pmalani@chromium.org>
Date: Wed, 29 Sep 2021 19:23:50 -0700
Subject: platform/chrome: cros_usbpd_notify: Move ec_command()

cros_ec_command() can be used by other modules too. So, move it to a
common location and export it.

This patch does not introduce any functional changes.

Signed-off-by: Prashant Malani <pmalani@chromium.org>
Signed-off-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Link: https://lore.kernel.org/r/20210930022403.3358070-3-pmalani@chromium.org
---
 include/linux/platform_data/cros_ec_proto.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h
index 55844ece0b32..20b17c43caeb 100644
--- a/include/linux/platform_data/cros_ec_proto.h
+++ b/include/linux/platform_data/cros_ec_proto.h
@@ -231,6 +231,9 @@ bool cros_ec_check_features(struct cros_ec_dev *ec, int feature);
 
 int cros_ec_get_sensor_count(struct cros_ec_dev *ec);
 
+int cros_ec_command(struct cros_ec_device *ec_dev, int command, uint8_t *outdata, int outsize,
+		    uint8_t *indata, int insize);
+
 /**
  * cros_ec_get_time_ns() - Return time in ns.
  *
-- 
cgit v1.2.3


From 5d122256f4e5900f7f8de5d8787af570314f6701 Mon Sep 17 00:00:00 2001
From: Prashant Malani <pmalani@chromium.org>
Date: Wed, 29 Sep 2021 19:23:52 -0700
Subject: platform/chrome: cros_ec_proto: Make data pointers void

Convert the input and output data pointers for cros_ec_command() to
void pointers so that the callers don't have to cast their custom
structs to uint8_t *.

Signed-off-by: Prashant Malani <pmalani@chromium.org>
Signed-off-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Link: https://lore.kernel.org/r/20210930022403.3358070-4-pmalani@chromium.org
---
 include/linux/platform_data/cros_ec_proto.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h
index 20b17c43caeb..f833473c5f44 100644
--- a/include/linux/platform_data/cros_ec_proto.h
+++ b/include/linux/platform_data/cros_ec_proto.h
@@ -231,8 +231,8 @@ bool cros_ec_check_features(struct cros_ec_dev *ec, int feature);
 
 int cros_ec_get_sensor_count(struct cros_ec_dev *ec);
 
-int cros_ec_command(struct cros_ec_device *ec_dev, int command, uint8_t *outdata, int outsize,
-		    uint8_t *indata, int insize);
+int cros_ec_command(struct cros_ec_device *ec_dev, int command, void *outdata, int outsize,
+		    void *indata, int insize);
 
 /**
  * cros_ec_get_time_ns() - Return time in ns.
-- 
cgit v1.2.3


From 4f1406396ed4d97518b8112327bdaf14fc9d4090 Mon Sep 17 00:00:00 2001
From: Prashant Malani <pmalani@chromium.org>
Date: Wed, 29 Sep 2021 19:23:54 -0700
Subject: platform/chrome: cros_ec_proto: Add version for ec_command

Add a version parameter to cros_ec_command() for callers that may want
to specify which version of the host command they would like to use.

Signed-off-by: Prashant Malani <pmalani@chromium.org>
Signed-off-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Link: https://lore.kernel.org/r/20210930022403.3358070-5-pmalani@chromium.org
---
 include/linux/platform_data/cros_ec_proto.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h
index f833473c5f44..9d370816a419 100644
--- a/include/linux/platform_data/cros_ec_proto.h
+++ b/include/linux/platform_data/cros_ec_proto.h
@@ -231,8 +231,8 @@ bool cros_ec_check_features(struct cros_ec_dev *ec, int feature);
 
 int cros_ec_get_sensor_count(struct cros_ec_dev *ec);
 
-int cros_ec_command(struct cros_ec_device *ec_dev, int command, void *outdata, int outsize,
-		    void *indata, int insize);
+int cros_ec_command(struct cros_ec_device *ec_dev, unsigned int version, int command, void *outdata,
+		    int outsize, void *indata, int insize);
 
 /**
  * cros_ec_get_time_ns() - Return time in ns.
-- 
cgit v1.2.3


From 381cecc5d7b777ada7cdf12f5b0bf4caf43bf7aa Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Fri, 3 Sep 2021 09:51:38 +0200
Subject: KVM: Drop 'except' parameter from kvm_make_vcpus_request_mask()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Both remaining callers of kvm_make_vcpus_request_mask() pass 'NULL' for
'except' parameter so it can just be dropped.

No functional change intended ©.

Suggested-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20210903075141.403071-6-vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 0f18df7fe874..89e1a0069833 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -160,7 +160,6 @@ static inline bool is_error_page(struct page *page)
 #define KVM_ARCH_REQ(nr)           KVM_ARCH_REQ_FLAGS(nr, 0)
 
 bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
-				 struct kvm_vcpu *except,
 				 unsigned long *vcpu_bitmap, cpumask_var_t tmp);
 bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req);
 bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
-- 
cgit v1.2.3


From 620b2438abf98f09e19802cbc3bc2e98179cdbe2 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Fri, 3 Sep 2021 09:51:41 +0200
Subject: KVM: Make kvm_make_vcpus_request_mask() use pre-allocated
 cpu_kick_mask

kvm_make_vcpus_request_mask() already disables preemption so just like
kvm_make_all_cpus_request_except() it can be switched to using
pre-allocated per-cpu cpumasks. This allows for improvements for both
users of the function: in Hyper-V emulation code 'tlb_flush' can now be
dropped from 'struct kvm_vcpu_hv' and kvm_make_scan_ioapic_request_mask()
gets rid of dynamic allocation.

cpumask_available() checks in kvm_make_vcpu_request() and
kvm_kick_many_cpus() can now be dropped as they checks for an impossible
condition: kvm_init() makes sure per-cpu masks are allocated.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Reviewed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20210903075141.403071-9-vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 89e1a0069833..f1b96a2ebaa7 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -160,7 +160,7 @@ static inline bool is_error_page(struct page *page)
 #define KVM_ARCH_REQ(nr)           KVM_ARCH_REQ_FLAGS(nr, 0)
 
 bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
-				 unsigned long *vcpu_bitmap, cpumask_var_t tmp);
+				 unsigned long *vcpu_bitmap);
 bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req);
 bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
 				      struct kvm_vcpu *except);
-- 
cgit v1.2.3


From a1c42ddedf35dbf5f25ea0982ed6e226eef7a78c Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Mon, 13 Sep 2021 15:57:44 +0200
Subject: kvm: rename KVM_MAX_VCPU_ID to KVM_MAX_VCPU_IDS

KVM_MAX_VCPU_ID is not specifying the highest allowed vcpu-id, but the
number of allowed vcpu-ids. This has already led to confusion, so
rename KVM_MAX_VCPU_ID to KVM_MAX_VCPU_IDS to make its semantics more
clear

Suggested-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20210913135745.13944-3-jgross@suse.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index f1b96a2ebaa7..1f9e80ce4723 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -39,8 +39,8 @@
 #include <asm/kvm_host.h>
 #include <linux/kvm_dirty_ring.h>
 
-#ifndef KVM_MAX_VCPU_ID
-#define KVM_MAX_VCPU_ID KVM_MAX_VCPUS
+#ifndef KVM_MAX_VCPU_IDS
+#define KVM_MAX_VCPU_IDS KVM_MAX_VCPUS
 #endif
 
 /*
-- 
cgit v1.2.3


From 515a0c79e7963cc4556ca61516cc09d39e592712 Mon Sep 17 00:00:00 2001
From: "Longpeng(Mike)" <longpeng2@huawei.com>
Date: Fri, 27 Aug 2021 16:00:03 +0800
Subject: kvm: irqfd: avoid update unmodified entries of the routing

All of the irqfds would to be updated when update the irq
routing, it's too expensive if there're too many irqfds.

However we can reduce the cost by avoid some unnecessary
updates. For irqs of MSI type on X86, the update can be
saved if the msi values are not change.

The vfio migration could receives benefit from this optimi-
zaiton. The test VM has 128 vcpus and 8 VF (with 65 vectors
enabled), so the VM has more than 520 irqfds. We mesure the
cost of the vfio_msix_enable (in QEMU, it would set routing
for each irqfd) for each VF, and we can see the total cost
can be significantly reduced.

                Origin         Apply this Patch
1st             8              4
2nd             15             5
3rd             22             6
4th             24             6
5th             36             7
6th             44             7
7th             51             8
8th             58             8
Total           258ms          51ms

We're also tring to optimize the QEMU part [1], but it's still
worth to optimize the KVM to gain more benefits.

[1] https://lists.gnu.org/archive/html/qemu-devel/2021-08/msg04215.html

Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
Message-Id: <20210827080003.2689-1-longpeng2@huawei.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1f9e80ce4723..3f87d6ad20bf 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1764,6 +1764,8 @@ void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *);
 void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *);
 int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
 				  uint32_t guest_irq, bool set);
+bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *,
+				  struct kvm_kernel_irq_routing_entry *);
 #endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */
 
 #ifdef CONFIG_HAVE_KVM_INVALID_WAKEUPS
-- 
cgit v1.2.3


From bcf9033e5449bdcaa9bed46467a7141a8049dadb Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Tue, 14 Sep 2021 14:10:33 +0200
Subject: sched: move CPU field back into thread_info if THREAD_INFO_IN_TASK=y

THREAD_INFO_IN_TASK moved the CPU field out of thread_info, but this
causes some issues on architectures that define raw_smp_processor_id()
in terms of this field, due to the fact that #include'ing linux/sched.h
to get at struct task_struct is problematic in terms of circular
dependencies.

Given that thread_info and task_struct are the same data structure
anyway when THREAD_INFO_IN_TASK=y, let's move it back so that having
access to the type definition of struct thread_info is sufficient to
reference the CPU number of the current task.

Note that this requires THREAD_INFO_IN_TASK's definition of the
task_thread_info() helper to be updated, as task_cpu() takes a
pointer-to-const, whereas task_thread_info() (which is used to generate
lvalues as well), needs a non-const pointer. So make it a macro instead.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
---
 include/linux/sched.h | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 39039ce8ac4c..8699594e3f99 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -750,10 +750,6 @@ struct task_struct {
 #ifdef CONFIG_SMP
 	int				on_cpu;
 	struct __call_single_node	wake_entry;
-#ifdef CONFIG_THREAD_INFO_IN_TASK
-	/* Current CPU: */
-	unsigned int			cpu;
-#endif
 	unsigned int			wakee_flips;
 	unsigned long			wakee_flip_decay_ts;
 	struct task_struct		*last_wakee;
@@ -1886,10 +1882,7 @@ extern struct thread_info init_thread_info;
 extern unsigned long init_stack[THREAD_SIZE / sizeof(unsigned long)];
 
 #ifdef CONFIG_THREAD_INFO_IN_TASK
-static inline struct thread_info *task_thread_info(struct task_struct *task)
-{
-	return &task->thread_info;
-}
+# define task_thread_info(task)	(&(task)->thread_info)
 #elif !defined(__HAVE_THREAD_FUNCTIONS)
 # define task_thread_info(task)	((struct thread_info *)(task)->stack)
 #endif
@@ -2114,11 +2107,7 @@ static __always_inline bool need_resched(void)
 
 static inline unsigned int task_cpu(const struct task_struct *p)
 {
-#ifdef CONFIG_THREAD_INFO_IN_TASK
-	return READ_ONCE(p->cpu);
-#else
 	return READ_ONCE(task_thread_info(p)->cpu);
-#endif
 }
 
 extern void set_task_cpu(struct task_struct *p, unsigned int cpu);
-- 
cgit v1.2.3


From 38a68934aa72459217986cf6b461d87f7602648a Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Fri, 24 Sep 2021 17:56:51 +0200
Subject: vfio: Move vfio_iommu_group_get() to vfio_register_group_dev()

We don't need to hold a reference to the group in the driver as well as
obtain a reference to the same group as the first thing
vfio_register_group_dev() does.

Since the drivers never use the group move this all into the core code.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Link: https://lore.kernel.org/r/20210924155705.4258-2-hch@lst.de
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 include/linux/vfio.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index b53a9557884a..f7083c2fd0d0 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -71,9 +71,6 @@ struct vfio_device_ops {
 	int	(*match)(struct vfio_device *vdev, char *buf);
 };
 
-extern struct iommu_group *vfio_iommu_group_get(struct device *dev);
-extern void vfio_iommu_group_put(struct iommu_group *group, struct device *dev);
-
 void vfio_init_group_dev(struct vfio_device *device, struct device *dev,
 			 const struct vfio_device_ops *ops);
 void vfio_uninit_group_dev(struct vfio_device *device);
-- 
cgit v1.2.3


From c68ea0d00ad82428154aed890ec9f793e460fa1c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 24 Sep 2021 17:56:57 +0200
Subject: vfio: simplify iommu group allocation for mediated devices

Reuse the logic in vfio_noiommu_group_alloc to allocate a fake
single-device iommu group for mediated devices by factoring out a common
function, and replacing the noiommu boolean field in struct vfio_group
with an enum to distinguish the three different kinds of groups.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Link: https://lore.kernel.org/r/20210924155705.4258-8-hch@lst.de
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 include/linux/vfio.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index f7083c2fd0d0..bbe293008626 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -75,6 +75,7 @@ void vfio_init_group_dev(struct vfio_device *device, struct device *dev,
 			 const struct vfio_device_ops *ops);
 void vfio_uninit_group_dev(struct vfio_device *device);
 int vfio_register_group_dev(struct vfio_device *device);
+int vfio_register_emulated_iommu_dev(struct vfio_device *device);
 void vfio_unregister_group_dev(struct vfio_device *device);
 extern struct vfio_device *vfio_device_get_from_dev(struct device *dev);
 extern void vfio_device_put(struct vfio_device *device);
-- 
cgit v1.2.3


From 67462037872d5ca57dc4674cccff191947b9b43e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 24 Sep 2021 17:56:58 +0200
Subject: vfio: remove unused method from vfio_iommu_driver_ops

The read, write and mmap methods are never implemented, so remove them.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Link: https://lore.kernel.org/r/20210924155705.4258-9-hch@lst.de
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 include/linux/vfio.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index bbe293008626..7a57a0077f96 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -95,13 +95,8 @@ struct vfio_iommu_driver_ops {
 	struct module	*owner;
 	void		*(*open)(unsigned long arg);
 	void		(*release)(void *iommu_data);
-	ssize_t		(*read)(void *iommu_data, char __user *buf,
-				size_t count, loff_t *ppos);
-	ssize_t		(*write)(void *iommu_data, const char __user *buf,
-				 size_t count, loff_t *size);
 	long		(*ioctl)(void *iommu_data, unsigned int cmd,
 				 unsigned long arg);
-	int		(*mmap)(void *iommu_data, struct vm_area_struct *vma);
 	int		(*attach_group)(void *iommu_data,
 					struct iommu_group *group);
 	void		(*detach_group)(void *iommu_data,
-- 
cgit v1.2.3


From 8cc02d22d7e1596ed687c4ff967c32056c2bef3e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 24 Sep 2021 17:56:59 +0200
Subject: vfio: move the vfio_iommu_driver_ops interface out of <linux/vfio.h>

Create a new private drivers/vfio/vfio.h header for the interface between
the VFIO core and the iommu drivers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Link: https://lore.kernel.org/r/20210924155705.4258-10-hch@lst.de
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 include/linux/vfio.h | 44 --------------------------------------------
 1 file changed, 44 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 7a57a0077f96..76191d7abed1 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -82,50 +82,6 @@ extern void vfio_device_put(struct vfio_device *device);
 
 int vfio_assign_device_set(struct vfio_device *device, void *set_id);
 
-/* events for the backend driver notify callback */
-enum vfio_iommu_notify_type {
-	VFIO_IOMMU_CONTAINER_CLOSE = 0,
-};
-
-/**
- * struct vfio_iommu_driver_ops - VFIO IOMMU driver callbacks
- */
-struct vfio_iommu_driver_ops {
-	char		*name;
-	struct module	*owner;
-	void		*(*open)(unsigned long arg);
-	void		(*release)(void *iommu_data);
-	long		(*ioctl)(void *iommu_data, unsigned int cmd,
-				 unsigned long arg);
-	int		(*attach_group)(void *iommu_data,
-					struct iommu_group *group);
-	void		(*detach_group)(void *iommu_data,
-					struct iommu_group *group);
-	int		(*pin_pages)(void *iommu_data,
-				     struct iommu_group *group,
-				     unsigned long *user_pfn,
-				     int npage, int prot,
-				     unsigned long *phys_pfn);
-	int		(*unpin_pages)(void *iommu_data,
-				       unsigned long *user_pfn, int npage);
-	int		(*register_notifier)(void *iommu_data,
-					     unsigned long *events,
-					     struct notifier_block *nb);
-	int		(*unregister_notifier)(void *iommu_data,
-					       struct notifier_block *nb);
-	int		(*dma_rw)(void *iommu_data, dma_addr_t user_iova,
-				  void *data, size_t count, bool write);
-	struct iommu_domain *(*group_iommu_domain)(void *iommu_data,
-						   struct iommu_group *group);
-	void		(*notify)(void *iommu_data,
-				  enum vfio_iommu_notify_type event);
-};
-
-extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops);
-
-extern void vfio_unregister_iommu_driver(
-				const struct vfio_iommu_driver_ops *ops);
-
 /*
  * External user API
  */
-- 
cgit v1.2.3


From fda49d97f2c4faf0b42e8796d3e6c868d992f3af Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 24 Sep 2021 17:57:00 +0200
Subject: vfio: remove the unused mdev iommu hook

The iommu_device field in struct mdev_device has never been used
since it was added more than 2 years ago.

This is a manual revert of commit 7bd50f0cd2
("vfio/type1: Add domain at(de)taching group helpers").

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Link: https://lore.kernel.org/r/20210924155705.4258-11-hch@lst.de
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 include/linux/mdev.h | 20 --------------------
 1 file changed, 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mdev.h b/include/linux/mdev.h
index 68427e8fadeb..15d03f6532d0 100644
--- a/include/linux/mdev.h
+++ b/include/linux/mdev.h
@@ -18,7 +18,6 @@ struct mdev_device {
 	void *driver_data;
 	struct list_head next;
 	struct mdev_type *type;
-	struct device *iommu_device;
 	bool active;
 };
 
@@ -27,25 +26,6 @@ static inline struct mdev_device *to_mdev_device(struct device *dev)
 	return container_of(dev, struct mdev_device, dev);
 }
 
-/*
- * Called by the parent device driver to set the device which represents
- * this mdev in iommu protection scope. By default, the iommu device is
- * NULL, that indicates using vendor defined isolation.
- *
- * @dev: the mediated device that iommu will isolate.
- * @iommu_device: a pci device which represents the iommu for @dev.
- */
-static inline void mdev_set_iommu_device(struct mdev_device *mdev,
-					 struct device *iommu_device)
-{
-	mdev->iommu_device = iommu_device;
-}
-
-static inline struct device *mdev_get_iommu_device(struct mdev_device *mdev)
-{
-	return mdev->iommu_device;
-}
-
 unsigned int mdev_get_type_group_id(struct mdev_device *mdev);
 unsigned int mtype_get_type_group_id(struct mdev_type *mtype);
 struct device *mtype_get_parent_dev(struct mdev_type *mtype);
-- 
cgit v1.2.3


From 02afb8d6048d6526619e6e2dcdc95ce9c2bdb52f Mon Sep 17 00:00:00 2001
From: Punit Agrawal <punitagrawal@gmail.com>
Date: Tue, 14 Sep 2021 23:38:57 +0900
Subject: kprobe: Simplify prepare_kprobe() by dropping redundant version

The function prepare_kprobe() is called during kprobe registration and
is responsible for ensuring any architecture related preparation for
the kprobe is done before returning.

One of two versions of prepare_kprobe() is chosen depending on the
availability of KPROBE_ON_FTRACE in the kernel configuration.

Simplify the code by dropping the version when KPROBE_ON_FTRACE is not
selected - instead relying on kprobe_ftrace() to return false when
KPROBE_ON_FTRACE is not set.

No functional change.

Link: https://lkml.kernel.org/r/163163033696.489837.9264661820279300788.stgit@devnote2

Signed-off-by: Punit Agrawal <punitagrawal@gmail.com>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/kprobes.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index e4f3bfe08757..0b75549b2815 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -354,6 +354,11 @@ static inline void wait_for_kprobe_optimizer(void) { }
 extern void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
 				  struct ftrace_ops *ops, struct ftrace_regs *fregs);
 extern int arch_prepare_kprobe_ftrace(struct kprobe *p);
+#else
+static inline int arch_prepare_kprobe_ftrace(struct kprobe *p)
+{
+	return -EINVAL;
+}
 #endif
 
 int arch_check_ftrace_location(struct kprobe *p);
-- 
cgit v1.2.3


From 4402deae8993fb0e25a19bb999b38df13e25a7e0 Mon Sep 17 00:00:00 2001
From: Punit Agrawal <punitagrawal@gmail.com>
Date: Tue, 14 Sep 2021 23:39:16 +0900
Subject: kprobes: Make arch_check_ftrace_location static

arch_check_ftrace_location() was introduced as a weak function in
commit f7f242ff004499 ("kprobes: introduce weak
arch_check_ftrace_location() helper function") to allow architectures
to handle kprobes call site on their own.

Recently, the only architecture (csky) to implement
arch_check_ftrace_location() was migrated to using the common
version.

As a result, further cleanup the code to drop the weak attribute and
rename the function to remove the architecture specific
implementation.

Link: https://lkml.kernel.org/r/163163035673.489837.2367816318195254104.stgit@devnote2

Signed-off-by: Punit Agrawal <punitagrawal@gmail.com>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/kprobes.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 0b75549b2815..8a9412bb0d5e 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -361,8 +361,6 @@ static inline int arch_prepare_kprobe_ftrace(struct kprobe *p)
 }
 #endif
 
-int arch_check_ftrace_location(struct kprobe *p);
-
 /* Get the kprobe at this addr (if any) - called with preemption disabled */
 struct kprobe *get_kprobe(void *addr);
 
-- 
cgit v1.2.3


From 223a76b268c9cfa265d454879ae09e2c9c808f87 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Tue, 14 Sep 2021 23:39:34 +0900
Subject: kprobes: Fix coding style issues

Fix coding style issues reported by checkpatch.pl and update
comments to quote variable names and add "()" to function
name.
One TODO comment in __disarm_kprobe() is removed because
it has been done by following commit.

Link: https://lkml.kernel.org/r/163163037468.489837.4282347782492003960.stgit@devnote2

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/kprobes.h | 40 +++++++++++++++++++++++-----------------
 1 file changed, 23 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 8a9412bb0d5e..756d3d23ce37 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -3,7 +3,6 @@
 #define _LINUX_KPROBES_H
 /*
  *  Kernel Probes (KProbes)
- *  include/linux/kprobes.h
  *
  * Copyright (C) IBM Corporation, 2002, 2004
  *
@@ -39,7 +38,7 @@
 #define KPROBE_REENTER		0x00000004
 #define KPROBE_HIT_SSDONE	0x00000008
 
-#else /* CONFIG_KPROBES */
+#else /* !CONFIG_KPROBES */
 #include <asm-generic/kprobes.h>
 typedef int kprobe_opcode_t;
 struct arch_specific_insn {
@@ -228,7 +227,7 @@ static nokprobe_inline struct kretprobe *get_kretprobe(struct kretprobe_instance
 	return READ_ONCE(ri->rph->rp);
 }
 
-#else /* CONFIG_KRETPROBES */
+#else /* !CONFIG_KRETPROBES */
 static inline void arch_prepare_kretprobe(struct kretprobe *rp,
 					struct pt_regs *regs)
 {
@@ -239,11 +238,15 @@ static inline int arch_trampoline_kprobe(struct kprobe *p)
 }
 #endif /* CONFIG_KRETPROBES */
 
+/* Markers of '_kprobe_blacklist' section */
+extern unsigned long __start_kprobe_blacklist[];
+extern unsigned long __stop_kprobe_blacklist[];
+
 extern struct kretprobe_blackpoint kretprobe_blacklist[];
 
 #ifdef CONFIG_KPROBES_SANITY_TEST
 extern int init_test_probes(void);
-#else
+#else /* !CONFIG_KPROBES_SANITY_TEST */
 static inline int init_test_probes(void)
 {
 	return 0;
@@ -303,7 +306,7 @@ static inline bool is_kprobe_##__name##_slot(unsigned long addr)	\
 #define KPROBE_OPTINSN_PAGE_SYM		"kprobe_optinsn_page"
 int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum,
 			     unsigned long *value, char *type, char *sym);
-#else /* __ARCH_WANT_KPROBES_INSN_SLOT */
+#else /* !__ARCH_WANT_KPROBES_INSN_SLOT */
 #define DEFINE_INSN_CACHE_OPS(__name)					\
 static inline bool is_kprobe_##__name##_slot(unsigned long addr)	\
 {									\
@@ -345,11 +348,12 @@ extern int sysctl_kprobes_optimization;
 extern int proc_kprobes_optimization_handler(struct ctl_table *table,
 					     int write, void *buffer,
 					     size_t *length, loff_t *ppos);
-#endif
+#endif /* CONFIG_SYSCTL */
 extern void wait_for_kprobe_optimizer(void);
-#else
+#else /* !CONFIG_OPTPROBES */
 static inline void wait_for_kprobe_optimizer(void) { }
 #endif /* CONFIG_OPTPROBES */
+
 #ifdef CONFIG_KPROBES_ON_FTRACE
 extern void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
 				  struct ftrace_ops *ops, struct ftrace_regs *fregs);
@@ -359,7 +363,7 @@ static inline int arch_prepare_kprobe_ftrace(struct kprobe *p)
 {
 	return -EINVAL;
 }
-#endif
+#endif /* CONFIG_KPROBES_ON_FTRACE */
 
 /* Get the kprobe at this addr (if any) - called with preemption disabled */
 struct kprobe *get_kprobe(void *addr);
@@ -367,7 +371,7 @@ struct kprobe *get_kprobe(void *addr);
 /* kprobe_running() will just return the current_kprobe on this CPU */
 static inline struct kprobe *kprobe_running(void)
 {
-	return (__this_cpu_read(current_kprobe));
+	return __this_cpu_read(current_kprobe);
 }
 
 static inline void reset_current_kprobe(void)
@@ -431,11 +435,11 @@ static inline struct kprobe *kprobe_running(void)
 }
 static inline int register_kprobe(struct kprobe *p)
 {
-	return -ENOSYS;
+	return -EOPNOTSUPP;
 }
 static inline int register_kprobes(struct kprobe **kps, int num)
 {
-	return -ENOSYS;
+	return -EOPNOTSUPP;
 }
 static inline void unregister_kprobe(struct kprobe *p)
 {
@@ -445,11 +449,11 @@ static inline void unregister_kprobes(struct kprobe **kps, int num)
 }
 static inline int register_kretprobe(struct kretprobe *rp)
 {
-	return -ENOSYS;
+	return -EOPNOTSUPP;
 }
 static inline int register_kretprobes(struct kretprobe **rps, int num)
 {
-	return -ENOSYS;
+	return -EOPNOTSUPP;
 }
 static inline void unregister_kretprobe(struct kretprobe *rp)
 {
@@ -465,11 +469,11 @@ static inline void kprobe_free_init_mem(void)
 }
 static inline int disable_kprobe(struct kprobe *kp)
 {
-	return -ENOSYS;
+	return -EOPNOTSUPP;
 }
 static inline int enable_kprobe(struct kprobe *kp)
 {
-	return -ENOSYS;
+	return -EOPNOTSUPP;
 }
 
 static inline bool within_kprobe_blacklist(unsigned long addr)
@@ -482,6 +486,7 @@ static inline int kprobe_get_kallsym(unsigned int symnum, unsigned long *value,
 	return -ERANGE;
 }
 #endif /* CONFIG_KPROBES */
+
 static inline int disable_kretprobe(struct kretprobe *rp)
 {
 	return disable_kprobe(&rp->kp);
@@ -496,13 +501,14 @@ static inline bool is_kprobe_insn_slot(unsigned long addr)
 {
 	return false;
 }
-#endif
+#endif /* !CONFIG_KPROBES */
+
 #ifndef CONFIG_OPTPROBES
 static inline bool is_kprobe_optinsn_slot(unsigned long addr)
 {
 	return false;
 }
-#endif
+#endif /* !CONFIG_OPTPROBES */
 
 /* Returns true if kprobes handled the fault */
 static nokprobe_inline bool kprobe_page_fault(struct pt_regs *regs,
-- 
cgit v1.2.3


From dfc05b55c3c6b15dbd889e9901ecb3fb695421bd Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Tue, 14 Sep 2021 23:39:46 +0900
Subject: kprobes: Use IS_ENABLED() instead of kprobes_built_in()

Use IS_ENABLED(CONFIG_KPROBES) instead of kprobes_built_in().
This inline function is introduced only for avoiding #ifdef.
But since now we have IS_ENABLED(), it is no longer needed.

Link: https://lkml.kernel.org/r/163163038581.489837.2805250706507372658.stgit@devnote2

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/kprobes.h | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 756d3d23ce37..9c28fbb18e74 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -180,14 +180,6 @@ struct kprobe_blacklist_entry {
 DECLARE_PER_CPU(struct kprobe *, current_kprobe);
 DECLARE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
 
-/*
- * For #ifdef avoidance:
- */
-static inline int kprobes_built_in(void)
-{
-	return 1;
-}
-
 extern void kprobe_busy_begin(void);
 extern void kprobe_busy_end(void);
 
@@ -417,10 +409,6 @@ int arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
 			    char *type, char *sym);
 #else /* !CONFIG_KPROBES: */
 
-static inline int kprobes_built_in(void)
-{
-	return 0;
-}
 static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
 {
 	return 0;
@@ -514,7 +502,7 @@ static inline bool is_kprobe_optinsn_slot(unsigned long addr)
 static nokprobe_inline bool kprobe_page_fault(struct pt_regs *regs,
 					      unsigned int trap)
 {
-	if (!kprobes_built_in())
+	if (!IS_ENABLED(CONFIG_KPROBES))
 		return false;
 	if (user_mode(regs))
 		return false;
-- 
cgit v1.2.3


From c42421e205fc2570a4d019184ea7d6c382c93f4c Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Tue, 14 Sep 2021 23:40:07 +0900
Subject: kprobes: treewide: Use 'kprobe_opcode_t *' for the code address in
 get_optimized_kprobe()

Since get_optimized_kprobe() is only used inside kprobes,
it doesn't need to use 'unsigned long' type for 'addr' parameter.
Make it use 'kprobe_opcode_t *' for the 'addr' parameter and
subsequent call of arch_within_optimized_kprobe() also should use
'kprobe_opcode_t *'.

Note that MAX_OPTIMIZED_LENGTH and RELATIVEJUMP_SIZE are defined
by byte-size, but the size of 'kprobe_opcode_t' depends on the
architecture. Therefore, we must be careful when calculating
addresses using those macros.

Link: https://lkml.kernel.org/r/163163040680.489837.12133032364499833736.stgit@devnote2

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/kprobes.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 9c28fbb18e74..6a5995f334a0 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -329,7 +329,7 @@ extern void arch_unoptimize_kprobes(struct list_head *oplist,
 				    struct list_head *done_list);
 extern void arch_unoptimize_kprobe(struct optimized_kprobe *op);
 extern int arch_within_optimized_kprobe(struct optimized_kprobe *op,
-					unsigned long addr);
+					kprobe_opcode_t *addr);
 
 extern void opt_pre_handler(struct kprobe *p, struct pt_regs *regs);
 
-- 
cgit v1.2.3


From 29e8077ae2beea6a85ad2d0bae9c550bd5d05ed9 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Tue, 14 Sep 2021 23:40:16 +0900
Subject: kprobes: Use bool type for functions which returns boolean value

Use the 'bool' type instead of 'int' for the functions which
returns a boolean value, because this makes clear that those
functions don't return any error code.

Link: https://lkml.kernel.org/r/163163041649.489837.17311187321419747536.stgit@devnote2

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/kprobes.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 6a5995f334a0..0ba3f9e316d4 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -104,25 +104,25 @@ struct kprobe {
 #define KPROBE_FLAG_FTRACE	8 /* probe is using ftrace */
 
 /* Has this kprobe gone ? */
-static inline int kprobe_gone(struct kprobe *p)
+static inline bool kprobe_gone(struct kprobe *p)
 {
 	return p->flags & KPROBE_FLAG_GONE;
 }
 
 /* Is this kprobe disabled ? */
-static inline int kprobe_disabled(struct kprobe *p)
+static inline bool kprobe_disabled(struct kprobe *p)
 {
 	return p->flags & (KPROBE_FLAG_DISABLED | KPROBE_FLAG_GONE);
 }
 
 /* Is this kprobe really running optimized path ? */
-static inline int kprobe_optimized(struct kprobe *p)
+static inline bool kprobe_optimized(struct kprobe *p)
 {
 	return p->flags & KPROBE_FLAG_OPTIMIZED;
 }
 
 /* Is this kprobe uses ftrace ? */
-static inline int kprobe_ftrace(struct kprobe *p)
+static inline bool kprobe_ftrace(struct kprobe *p)
 {
 	return p->flags & KPROBE_FLAG_FTRACE;
 }
-- 
cgit v1.2.3


From f2ec8d9a3b8c0f22cd6a2b4f5a2d9aee5206e3b7 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Tue, 14 Sep 2021 23:40:36 +0900
Subject: kprobes: treewide: Replace arch_deref_entry_point() with
 dereference_symbol_descriptor()

~15 years ago kprobes grew the 'arch_deref_entry_point()' __weak function:

  3d7e33825d87: ("jprobes: make jprobes a little safer for users")

But this is just open-coded dereference_symbol_descriptor() in essence, and
its obscure nature was causing bugs.

Just use the real thing and remove arch_deref_entry_point().

Link: https://lkml.kernel.org/r/163163043630.489837.7924988885652708696.stgit@devnote2

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Tested-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/kprobes.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 0ba3f9e316d4..2ed61fcbc89c 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -381,7 +381,6 @@ int register_kprobe(struct kprobe *p);
 void unregister_kprobe(struct kprobe *p);
 int register_kprobes(struct kprobe **kps, int num);
 void unregister_kprobes(struct kprobe **kps, int num);
-unsigned long arch_deref_entry_point(void *);
 
 int register_kretprobe(struct kretprobe *rp);
 void unregister_kretprobe(struct kretprobe *rp);
-- 
cgit v1.2.3


From 96fed8ac2bb64ab45497fdd8e3d390165b7a9be8 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Tue, 14 Sep 2021 23:40:45 +0900
Subject: kprobes: treewide: Remove trampoline_address from
 kretprobe_trampoline_handler()

The __kretprobe_trampoline_handler() callback, called from low level
arch kprobes methods, has the 'trampoline_address' parameter, which is
entirely superfluous as it basically just replicates:

  dereference_kernel_function_descriptor(kretprobe_trampoline)

In fact we had bugs in arch code where it wasn't replicated correctly.

So remove this superfluous parameter and use kretprobe_trampoline_addr()
instead.

Link: https://lkml.kernel.org/r/163163044546.489837.13505751885476015002.stgit@devnote2

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Tested-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/kprobes.h | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 2ed61fcbc89c..96f5df93e36e 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -188,15 +188,23 @@ extern void arch_prepare_kretprobe(struct kretprobe_instance *ri,
 				   struct pt_regs *regs);
 extern int arch_trampoline_kprobe(struct kprobe *p);
 
+void kretprobe_trampoline(void);
+/*
+ * Since some architecture uses structured function pointer,
+ * use dereference_function_descriptor() to get real function address.
+ */
+static nokprobe_inline void *kretprobe_trampoline_addr(void)
+{
+	return dereference_kernel_function_descriptor(kretprobe_trampoline);
+}
+
 /* If the trampoline handler called from a kprobe, use this version */
 unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs,
-				void *trampoline_address,
-				void *frame_pointer);
+					     void *frame_pointer);
 
 static nokprobe_inline
 unsigned long kretprobe_trampoline_handler(struct pt_regs *regs,
-				void *trampoline_address,
-				void *frame_pointer)
+					   void *frame_pointer)
 {
 	unsigned long ret;
 	/*
@@ -205,7 +213,7 @@ unsigned long kretprobe_trampoline_handler(struct pt_regs *regs,
 	 * be running at this point.
 	 */
 	kprobe_busy_begin();
-	ret = __kretprobe_trampoline_handler(regs, trampoline_address, frame_pointer);
+	ret = __kretprobe_trampoline_handler(regs, frame_pointer);
 	kprobe_busy_end();
 
 	return ret;
-- 
cgit v1.2.3


From adf8a61a940c49fea6fab9c3865f2b69b8ceef28 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Tue, 14 Sep 2021 23:40:54 +0900
Subject: kprobes: treewide: Make it harder to refer kretprobe_trampoline
 directly

Since now there is kretprobe_trampoline_addr() for referring the
address of kretprobe trampoline code, we don't need to access
kretprobe_trampoline directly.

Make it harder to refer by renaming it to __kretprobe_trampoline().

Link: https://lkml.kernel.org/r/163163045446.489837.14510577516938803097.stgit@devnote2

Suggested-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/kprobes.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 96f5df93e36e..b6b2370f4a4c 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -188,14 +188,14 @@ extern void arch_prepare_kretprobe(struct kretprobe_instance *ri,
 				   struct pt_regs *regs);
 extern int arch_trampoline_kprobe(struct kprobe *p);
 
-void kretprobe_trampoline(void);
+void __kretprobe_trampoline(void);
 /*
  * Since some architecture uses structured function pointer,
  * use dereference_function_descriptor() to get real function address.
  */
 static nokprobe_inline void *kretprobe_trampoline_addr(void)
 {
-	return dereference_kernel_function_descriptor(kretprobe_trampoline);
+	return dereference_kernel_function_descriptor(__kretprobe_trampoline);
 }
 
 /* If the trampoline handler called from a kprobe, use this version */
-- 
cgit v1.2.3


From 03bac0df2886882c43e6d0bfff9dee84a184fc7e Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Tue, 14 Sep 2021 23:41:04 +0900
Subject: kprobes: Add kretprobe_find_ret_addr() for searching return address

Introduce kretprobe_find_ret_addr() and is_kretprobe_trampoline().
These APIs will be used by the ORC stack unwinder and ftrace, so that
they can check whether the given address points kretprobe trampoline
code and query the correct return address in that case.

Link: https://lkml.kernel.org/r/163163046461.489837.1044778356430293962.stgit@devnote2

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Tested-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/kprobes.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index b6b2370f4a4c..6d47a9da1e0a 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -505,6 +505,28 @@ static inline bool is_kprobe_optinsn_slot(unsigned long addr)
 }
 #endif /* !CONFIG_OPTPROBES */
 
+#ifdef CONFIG_KRETPROBES
+static nokprobe_inline bool is_kretprobe_trampoline(unsigned long addr)
+{
+	return (void *)addr == kretprobe_trampoline_addr();
+}
+
+unsigned long kretprobe_find_ret_addr(struct task_struct *tsk, void *fp,
+				      struct llist_node **cur);
+#else
+static nokprobe_inline bool is_kretprobe_trampoline(unsigned long addr)
+{
+	return false;
+}
+
+static nokprobe_inline
+unsigned long kretprobe_find_ret_addr(struct task_struct *tsk, void *fp,
+				      struct llist_node **cur)
+{
+	return 0;
+}
+#endif
+
 /* Returns true if kprobes handled the fault */
 static nokprobe_inline bool kprobe_page_fault(struct pt_regs *regs,
 					      unsigned int trap)
-- 
cgit v1.2.3


From e028c4f7ac7ca8c96126fe46c54ab3d56ffe6a66 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Tue, 14 Sep 2021 23:41:13 +0900
Subject: objtool: Add frame-pointer-specific function ignore

Add a CONFIG_FRAME_POINTER-specific version of
STACK_FRAME_NON_STANDARD() for the case where a function is
intentionally missing frame pointer setup, but otherwise needs
objtool/ORC coverage when frame pointers are disabled.

Link: https://lkml.kernel.org/r/163163047364.489837.17377799909553689661.stgit@devnote2

Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Tested-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/objtool.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/objtool.h b/include/linux/objtool.h
index 7e72d975cb76..aca52db2f3f3 100644
--- a/include/linux/objtool.h
+++ b/include/linux/objtool.h
@@ -66,6 +66,17 @@ struct unwind_hint {
 	static void __used __section(".discard.func_stack_frame_non_standard") \
 		*__func_stack_frame_non_standard_##func = func
 
+/*
+ * STACK_FRAME_NON_STANDARD_FP() is a frame-pointer-specific function ignore
+ * for the case where a function is intentionally missing frame pointer setup,
+ * but otherwise needs objtool/ORC coverage when frame pointers are disabled.
+ */
+#ifdef CONFIG_FRAME_POINTER
+#define STACK_FRAME_NON_STANDARD_FP(func) STACK_FRAME_NON_STANDARD(func)
+#else
+#define STACK_FRAME_NON_STANDARD_FP(func)
+#endif
+
 #else /* __ASSEMBLY__ */
 
 /*
@@ -127,6 +138,7 @@ struct unwind_hint {
 #define UNWIND_HINT(sp_reg, sp_offset, type, end)	\
 	"\n\t"
 #define STACK_FRAME_NON_STANDARD(func)
+#define STACK_FRAME_NON_STANDARD_FP(func)
 #else
 #define ANNOTATE_INTRA_FUNCTION_CALL
 .macro UNWIND_HINT sp_reg:req sp_offset=0 type:req end=0
-- 
cgit v1.2.3


From bf094cffea2a6503ce84062f9f0243bef77c58f9 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Tue, 14 Sep 2021 23:42:51 +0900
Subject: x86/kprobes: Fixup return address in generic trampoline handler

In x86, the fake return address on the stack saved by
__kretprobe_trampoline() will be replaced with the real return
address after returning from trampoline_handler(). Before fixing
the return address, the real return address can be found in the
'current->kretprobe_instances'.

However, since there is a window between updating the
'current->kretprobe_instances' and fixing the address on the stack,
if an interrupt happens at that timing and the interrupt handler
does stacktrace, it may fail to unwind because it can not get
the correct return address from 'current->kretprobe_instances'.

This will eliminate that window by fixing the return address
right before updating 'current->kretprobe_instances'.

Link: https://lkml.kernel.org/r/163163057094.489837.9044470370440745866.stgit@devnote2

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Tested-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/kprobes.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 6d47a9da1e0a..e974caf39d3e 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -188,6 +188,9 @@ extern void arch_prepare_kretprobe(struct kretprobe_instance *ri,
 				   struct pt_regs *regs);
 extern int arch_trampoline_kprobe(struct kprobe *p);
 
+void arch_kretprobe_fixup_return(struct pt_regs *regs,
+				 kprobe_opcode_t *correct_ret_addr);
+
 void __kretprobe_trampoline(void);
 /*
  * Since some architecture uses structured function pointer,
-- 
cgit v1.2.3


From 6b51b02a3a0ac49dfe302818d0746a799545e4e9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
Date: Tue, 15 Jun 2021 13:12:33 +0200
Subject: dma-buf: fix and rework dma_buf_poll v7
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Daniel pointed me towards this function and there are multiple obvious problems
in the implementation.

First of all the retry loop is not working as intended. In general the retry
makes only sense if you grab the reference first and then check the sequence
values.

Then we should always also wait for the exclusive fence.

It's also good practice to keep the reference around when installing callbacks
to fences you don't own.

And last the whole implementation was unnecessary complex and rather hard to
understand which could lead to probably unexpected behavior of the IOCTL.

Fix all this by reworking the implementation from scratch. Dropping the
whole RCU approach and taking the lock instead.

Only mildly tested and needs a thoughtful review of the code.

Pushing through drm-misc-next to avoid merge conflicts and give the code
another round of testing.

v2: fix the reference counting as well
v3: keep the excl fence handling as is for stable
v4: back to testing all fences, drop RCU
v5: handle in and out separately
v6: add missing clear of events
v7: change coding style as suggested by Michel, drop unused variables

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Tested-by: Michel Dänzer <mdaenzer@redhat.com>
CC: stable@vger.kernel.org
Link: https://patchwork.freedesktop.org/patch/msgid/20210720131110.88512-1-christian.koenig@amd.com
---
 include/linux/dma-buf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index 66470c37e471..02c2eb874da6 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -440,7 +440,7 @@ struct dma_buf {
 		wait_queue_head_t *poll;
 
 		__poll_t active;
-	} cb_excl, cb_shared;
+	} cb_in, cb_out;
 #ifdef CONFIG_DMABUF_SYSFS_STATS
 	/**
 	 * @sysfs_entry:
-- 
cgit v1.2.3


From 78b497f2e62d8c7514de5f83c80837bbb120e93e Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Fri, 3 Sep 2021 15:08:05 +0200
Subject: kvm: use kvfree() in kvm_arch_free_vm()

By switching from kfree() to kvfree() in kvm_arch_free_vm() Arm64 can
use the common variant. This can be accomplished by adding another
macro __KVM_HAVE_ARCH_VM_FREE, which will be used only by x86 for now.

Further simplification can be achieved by adding __kvm_arch_free_vm()
doing the common part.

Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
Message-Id: <20210903130808.30142-5-jgross@suse.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3f87d6ad20bf..60a35d9fe259 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1081,10 +1081,17 @@ static inline struct kvm *kvm_arch_alloc_vm(void)
 {
 	return kzalloc(sizeof(struct kvm), GFP_KERNEL);
 }
+#endif
+
+static inline void __kvm_arch_free_vm(struct kvm *kvm)
+{
+	kvfree(kvm);
+}
 
+#ifndef __KVM_HAVE_ARCH_VM_FREE
 static inline void kvm_arch_free_vm(struct kvm *kvm)
 {
-	kfree(kvm);
+	__kvm_arch_free_vm(kvm);
 }
 #endif
 
-- 
cgit v1.2.3


From 874f670e6088d3bff3972ecd44c1cb00610f9183 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 23 Sep 2021 18:54:35 +0200
Subject: sched: Clean up the might_sleep() underscore zoo

__might_sleep() vs. ___might_sleep() is hard to distinguish. Aside of that
the three underscore variant is exposed to provide a checkpoint for
rescheduling points which are distinct from blocking points.

They are semantically a preemption point which means that scheduling is
state preserving. A real blocking operation, e.g. mutex_lock(), wait*(),
which cannot preserve a task state which is not equal to RUNNING.

While technically blocking on a "sleeping" spinlock in RT enabled kernels
falls into the voluntary scheduling category because it has to wait until
the contended spin/rw lock becomes available, the RT lock substitution code
can semantically be mapped to a voluntary preemption because the RT lock
substitution code and the scheduler are providing mechanisms to preserve
the task state and to take regular non-lock related wakeups into account.

Rename ___might_sleep() to __might_resched() to make the distinction of
these functions clear.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20210923165357.928693482@linutronix.de
---
 include/linux/kernel.h | 6 +++---
 include/linux/sched.h  | 8 ++++----
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 2776423a587e..5e4ae54da73e 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -111,7 +111,7 @@ static __always_inline void might_resched(void)
 #endif /* CONFIG_PREEMPT_* */
 
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
-extern void ___might_sleep(const char *file, int line, int preempt_offset);
+extern void __might_resched(const char *file, int line, int preempt_offset);
 extern void __might_sleep(const char *file, int line, int preempt_offset);
 extern void __cant_sleep(const char *file, int line, int preempt_offset);
 extern void __cant_migrate(const char *file, int line);
@@ -168,8 +168,8 @@ extern void __cant_migrate(const char *file, int line);
  */
 # define non_block_end() WARN_ON(current->non_block_count-- == 0)
 #else
-  static inline void ___might_sleep(const char *file, int line,
-				   int preempt_offset) { }
+  static inline void __might_resched(const char *file, int line,
+				     int preempt_offset) { }
   static inline void __might_sleep(const char *file, int line,
 				   int preempt_offset) { }
 # define might_sleep() do { might_resched(); } while (0)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e12b524426b0..b38f002334d5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2038,7 +2038,7 @@ static inline int _cond_resched(void) { return 0; }
 #endif /* !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) */
 
 #define cond_resched() ({			\
-	___might_sleep(__FILE__, __LINE__, 0);	\
+	__might_resched(__FILE__, __LINE__, 0);	\
 	_cond_resched();			\
 })
 
@@ -2046,9 +2046,9 @@ extern int __cond_resched_lock(spinlock_t *lock);
 extern int __cond_resched_rwlock_read(rwlock_t *lock);
 extern int __cond_resched_rwlock_write(rwlock_t *lock);
 
-#define cond_resched_lock(lock) ({				\
-	___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\
-	__cond_resched_lock(lock);				\
+#define cond_resched_lock(lock) ({					\
+	__might_resched(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);	\
+	__cond_resched_lock(lock);					\
 })
 
 #define cond_resched_rwlock_read(lock) ({			\
-- 
cgit v1.2.3


From 7b5ff4bb9adc53cfbf7ac9ba7820ccf0cd7c070a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 23 Sep 2021 18:54:37 +0200
Subject: sched: Make cond_resched_*lock() variants consistent vs.
 might_sleep()

Commit 3427445afd26 ("sched: Exclude cond_resched() from nested sleep
test") removed the task state check of __might_sleep() for
cond_resched_lock() because cond_resched_lock() is not a voluntary
scheduling point which blocks. It's a preemption point which requires the
lock holder to release the spin lock.

The same rationale applies to cond_resched_rwlock_read/write(), but those
were not touched.

Make it consistent and use the non-state checking __might_resched() there
as well.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20210923165357.991262778@linutronix.de
---
 include/linux/sched.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b38f002334d5..7a989f2487f8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2051,14 +2051,14 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock);
 	__cond_resched_lock(lock);					\
 })
 
-#define cond_resched_rwlock_read(lock) ({			\
-	__might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);	\
-	__cond_resched_rwlock_read(lock);			\
+#define cond_resched_rwlock_read(lock) ({				\
+	__might_resched(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);	\
+	__cond_resched_rwlock_read(lock);				\
 })
 
-#define cond_resched_rwlock_write(lock) ({			\
-	__might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);	\
-	__cond_resched_rwlock_write(lock);			\
+#define cond_resched_rwlock_write(lock) ({				\
+	__might_resched(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);	\
+	__cond_resched_rwlock_write(lock);				\
 })
 
 static inline void cond_resched_rcu(void)
-- 
cgit v1.2.3


From 42a387566c567603bafa1ec0c5b71c35cba83e86 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 23 Sep 2021 18:54:38 +0200
Subject: sched: Remove preempt_offset argument from __might_sleep()

All callers hand in 0 and never will hand in anything else.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20210923165358.054321586@linutronix.de
---
 include/linux/kernel.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 5e4ae54da73e..f95ee786e4ef 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -112,7 +112,7 @@ static __always_inline void might_resched(void)
 
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 extern void __might_resched(const char *file, int line, int preempt_offset);
-extern void __might_sleep(const char *file, int line, int preempt_offset);
+extern void __might_sleep(const char *file, int line);
 extern void __cant_sleep(const char *file, int line, int preempt_offset);
 extern void __cant_migrate(const char *file, int line);
 
@@ -129,7 +129,7 @@ extern void __cant_migrate(const char *file, int line);
  * supposed to.
  */
 # define might_sleep() \
-	do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
+	do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0)
 /**
  * cant_sleep - annotation for functions that cannot sleep
  *
@@ -170,8 +170,7 @@ extern void __cant_migrate(const char *file, int line);
 #else
   static inline void __might_resched(const char *file, int line,
 				     int preempt_offset) { }
-  static inline void __might_sleep(const char *file, int line,
-				   int preempt_offset) { }
+static inline void __might_sleep(const char *file, int line) { }
 # define might_sleep() do { might_resched(); } while (0)
 # define cant_sleep() do { } while (0)
 # define cant_migrate()		do { } while (0)
-- 
cgit v1.2.3


From 50e081b96e35e43b65591f40f7376204decd1cb5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 23 Sep 2021 18:54:43 +0200
Subject: sched: Make RCU nest depth distinct in __might_resched()

For !RT kernels RCU nest depth in __might_resched() is always expected to
be 0, but on RT kernels it can be non zero while the preempt count is
expected to be always 0.

Instead of playing magic games in interpreting the 'preempt_offset'
argument, rename it to 'offsets' and use the lower 8 bits for the expected
preempt count, allow to hand in the expected RCU nest depth in the upper
bits and adopt the __might_resched() code and related checks and printks.

The affected call sites are updated in subsequent steps.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20210923165358.243232823@linutronix.de
---
 include/linux/kernel.h | 4 ++--
 include/linux/sched.h  | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index f95ee786e4ef..e8696e4a45aa 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -111,7 +111,7 @@ static __always_inline void might_resched(void)
 #endif /* CONFIG_PREEMPT_* */
 
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
-extern void __might_resched(const char *file, int line, int preempt_offset);
+extern void __might_resched(const char *file, int line, unsigned int offsets);
 extern void __might_sleep(const char *file, int line);
 extern void __cant_sleep(const char *file, int line, int preempt_offset);
 extern void __cant_migrate(const char *file, int line);
@@ -169,7 +169,7 @@ extern void __cant_migrate(const char *file, int line);
 # define non_block_end() WARN_ON(current->non_block_count-- == 0)
 #else
   static inline void __might_resched(const char *file, int line,
-				     int preempt_offset) { }
+				     unsigned int offsets) { }
 static inline void __might_sleep(const char *file, int line) { }
 # define might_sleep() do { might_resched(); } while (0)
 # define cant_sleep() do { } while (0)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7a989f2487f8..b448c7460577 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2046,6 +2046,9 @@ extern int __cond_resched_lock(spinlock_t *lock);
 extern int __cond_resched_rwlock_read(rwlock_t *lock);
 extern int __cond_resched_rwlock_write(rwlock_t *lock);
 
+#define MIGHT_RESCHED_RCU_SHIFT		8
+#define MIGHT_RESCHED_PREEMPT_MASK	((1U << MIGHT_RESCHED_RCU_SHIFT) - 1)
+
 #define cond_resched_lock(lock) ({					\
 	__might_resched(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);	\
 	__cond_resched_lock(lock);					\
-- 
cgit v1.2.3


From 3e9cc688e56cc2abb9b6067f57c8397f6c96d42c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 23 Sep 2021 18:54:44 +0200
Subject: sched: Make cond_resched_lock() variants RT aware

The __might_resched() checks in the cond_resched_lock() variants use
PREEMPT_LOCK_OFFSET for preempt count offset checking which takes the
preemption disable by the spin_lock() which is still held at that point
into account.

On PREEMPT_RT enabled kernels spin/rw_lock held sections stay preemptible
which means PREEMPT_LOCK_OFFSET is 0, but that still triggers the
__might_resched() check because that takes RCU read side nesting into
account.

On RT enabled kernels spin/read/write_lock() issue rcu_read_lock() to
resemble the !RT semantics, which means in cond_resched_lock() the might
resched check will see preempt_count() == 0 and rcu_preempt_depth() == 1.

Introduce PREEMPT_LOCK_SCHED_OFFSET for those might resched checks and map
them depending on CONFIG_PREEMPT_RT.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20210923165358.305969211@linutronix.de
---
 include/linux/preempt.h |  5 +++--
 include/linux/sched.h   | 34 +++++++++++++++++++++++++---------
 2 files changed, 28 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 4d244e295e85..031898b38d06 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -122,9 +122,10 @@
  * The preempt_count offset after spin_lock()
  */
 #if !defined(CONFIG_PREEMPT_RT)
-#define PREEMPT_LOCK_OFFSET	PREEMPT_DISABLE_OFFSET
+#define PREEMPT_LOCK_OFFSET		PREEMPT_DISABLE_OFFSET
 #else
-#define PREEMPT_LOCK_OFFSET	0
+/* Locks on RT do not disable preemption */
+#define PREEMPT_LOCK_OFFSET		0
 #endif
 
 /*
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b448c7460577..21b7cd00bf1d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2049,19 +2049,35 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock);
 #define MIGHT_RESCHED_RCU_SHIFT		8
 #define MIGHT_RESCHED_PREEMPT_MASK	((1U << MIGHT_RESCHED_RCU_SHIFT) - 1)
 
-#define cond_resched_lock(lock) ({					\
-	__might_resched(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);	\
-	__cond_resched_lock(lock);					\
+#ifndef CONFIG_PREEMPT_RT
+/*
+ * Non RT kernels have an elevated preempt count due to the held lock,
+ * but are not allowed to be inside a RCU read side critical section
+ */
+# define PREEMPT_LOCK_RESCHED_OFFSETS	PREEMPT_LOCK_OFFSET
+#else
+/*
+ * spin/rw_lock() on RT implies rcu_read_lock(). The might_sleep() check in
+ * cond_resched*lock() has to take that into account because it checks for
+ * preempt_count() and rcu_preempt_depth().
+ */
+# define PREEMPT_LOCK_RESCHED_OFFSETS	\
+	(PREEMPT_LOCK_OFFSET + (1U << MIGHT_RESCHED_RCU_SHIFT))
+#endif
+
+#define cond_resched_lock(lock) ({						\
+	__might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS);	\
+	__cond_resched_lock(lock);						\
 })
 
-#define cond_resched_rwlock_read(lock) ({				\
-	__might_resched(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);	\
-	__cond_resched_rwlock_read(lock);				\
+#define cond_resched_rwlock_read(lock) ({					\
+	__might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS);	\
+	__cond_resched_rwlock_read(lock);					\
 })
 
-#define cond_resched_rwlock_write(lock) ({				\
-	__might_resched(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);	\
-	__cond_resched_rwlock_write(lock);				\
+#define cond_resched_rwlock_write(lock) ({					\
+	__might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS);	\
+	__cond_resched_rwlock_write(lock);					\
 })
 
 static inline void cond_resched_rcu(void)
-- 
cgit v1.2.3


From 74e78adc6ccf6c3b53939788cf0c49f54db70731 Mon Sep 17 00:00:00 2001
From: Sai Krishna Potthuri <lakshmi.sai.krishna.potthuri@xilinx.com>
Date: Fri, 24 Sep 2021 15:37:08 +0530
Subject: firmware: xilinx: Add OSPI Mux selection support

Add OSPI Mux selection API support to select the AXI interface to OSPI.

Signed-off-by: Sai Krishna Potthuri <lakshmi.sai.krishna.potthuri@xilinx.com>
Link: https://lore.kernel.org/r/1632478031-12242-2-git-send-email-lakshmi.sai.krishna.potthuri@xilinx.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/firmware/xlnx-zynqmp.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index 56b426fe020c..4c70a6e2141e 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -123,6 +123,7 @@ enum pm_ioctl_id {
 	IOCTL_READ_PGGS = 15,
 	/* Set healthy bit value */
 	IOCTL_SET_BOOT_HEALTH_STATUS = 17,
+	IOCTL_OSPI_MUX_SELECT = 21,
 };
 
 enum pm_query_id {
@@ -351,6 +352,11 @@ enum zynqmp_pm_shutdown_subtype {
 	ZYNQMP_PM_SHUTDOWN_SUBTYPE_SYSTEM = 2,
 };
 
+enum ospi_mux_select_type {
+	PM_OSPI_MUX_SEL_DMA = 0,
+	PM_OSPI_MUX_SEL_LINEAR = 1,
+};
+
 /**
  * struct zynqmp_pm_query_data - PM query data
  * @qid:	query ID
@@ -387,6 +393,7 @@ int zynqmp_pm_set_pll_frac_data(u32 clk_id, u32 data);
 int zynqmp_pm_get_pll_frac_data(u32 clk_id, u32 *data);
 int zynqmp_pm_set_sd_tapdelay(u32 node_id, u32 type, u32 value);
 int zynqmp_pm_sd_dll_reset(u32 node_id, u32 type);
+int zynqmp_pm_ospi_mux_select(u32 dev_id, u32 select);
 int zynqmp_pm_reset_assert(const enum zynqmp_pm_reset reset,
 			   const enum zynqmp_pm_reset_action assert_flag);
 int zynqmp_pm_reset_get_status(const enum zynqmp_pm_reset reset, u32 *status);
@@ -508,6 +515,11 @@ static inline int zynqmp_pm_sd_dll_reset(u32 node_id, u32 type)
 	return -ENODEV;
 }
 
+static inline int zynqmp_pm_ospi_mux_select(u32 dev_id, u32 select)
+{
+	return -ENODEV;
+}
+
 static inline int zynqmp_pm_reset_assert(const enum zynqmp_pm_reset reset,
 					 const enum zynqmp_pm_reset_action assert_flag)
 {
-- 
cgit v1.2.3


From 42f355ef59a2f98fa4affb4265d3ba3e2d86baf1 Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Wed, 19 May 2021 16:00:20 -0400
Subject: audit: replace magic audit syscall class numbers with macros

Replace audit syscall class magic numbers with macros.

This required putting the macros into new header file
include/linux/audit_arch.h since the syscall macros were
included for both 64 bit and 32 bit in any compat code, causing
redefinition warnings.

Link: https://lore.kernel.org/r/2300b1083a32aade7ae7efb95826e8f3f260b1df.1621363275.git.rgb@redhat.com
Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
Acked-by: Christian Brauner <christian.brauner@ubuntu.com>
[PM: renamed header to audit_arch.h after consulting with Richard]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/audit.h      |  1 +
 include/linux/audit_arch.h | 23 +++++++++++++++++++++++
 2 files changed, 24 insertions(+)
 create mode 100644 include/linux/audit_arch.h

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 82b7c1116a85..5fbeeeb6b726 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -11,6 +11,7 @@
 
 #include <linux/sched.h>
 #include <linux/ptrace.h>
+#include <linux/audit_arch.h>
 #include <uapi/linux/audit.h>
 #include <uapi/linux/netfilter/nf_tables.h>
 
diff --git a/include/linux/audit_arch.h b/include/linux/audit_arch.h
new file mode 100644
index 000000000000..d4a506faabb0
--- /dev/null
+++ b/include/linux/audit_arch.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* audit_arch.h -- Arch layer specific support for audit
+ *
+ * Copyright 2021 Red Hat Inc., Durham, North Carolina.
+ * All Rights Reserved.
+ *
+ * Author: Richard Guy Briggs <rgb@redhat.com>
+ */
+#ifndef _LINUX_AUDIT_ARCH_H_
+#define _LINUX_AUDIT_ARCH_H_
+
+enum auditsc_class_t {
+	AUDITSC_NATIVE = 0,
+	AUDITSC_COMPAT,
+	AUDITSC_OPEN,
+	AUDITSC_OPENAT,
+	AUDITSC_SOCKETCALL,
+	AUDITSC_EXECVE,
+
+	AUDITSC_NVALS /* count */
+};
+
+#endif
-- 
cgit v1.2.3


From 1c30e3af8a79260cdba833a719209b01e6b92300 Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Wed, 19 May 2021 16:00:21 -0400
Subject: audit: add support for the openat2 syscall

The openat2(2) syscall was added in kernel v5.6 with commit
fddb5d430ad9 ("open: introduce openat2(2) syscall").

Add the openat2(2) syscall to the audit syscall classifier.

Link: https://github.com/linux-audit/audit-kernel/issues/67
Link: https://lore.kernel.org/r/f5f1a4d8699613f8c02ce762807228c841c2e26f.1621363275.git.rgb@redhat.com
Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
Acked-by: Christian Brauner <christian.brauner@ubuntu.com>
[PM: merge fuzz due to previous header rename, commit line wraps]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/audit_arch.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/audit_arch.h b/include/linux/audit_arch.h
index d4a506faabb0..8fdb1afe251a 100644
--- a/include/linux/audit_arch.h
+++ b/include/linux/audit_arch.h
@@ -16,6 +16,7 @@ enum auditsc_class_t {
 	AUDITSC_OPENAT,
 	AUDITSC_SOCKETCALL,
 	AUDITSC_EXECVE,
+	AUDITSC_OPENAT2,
 
 	AUDITSC_NVALS /* count */
 };
-- 
cgit v1.2.3


From e8c0722927e8fc112d78cc0435d336fcd7e3507c Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 1 Oct 2021 18:15:27 +0300
Subject: net: mscc: ocelot: write full VLAN TCI in the injection header

The VLAN TCI contains more than the VLAN ID, it also has the VLAN PCP
and Drop Eligibility Indicator.

If the ocelot driver is going to write the VLAN header inside the DSA
tag, it could just as well write the entire TCI.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dsa/ocelot.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dsa/ocelot.h b/include/linux/dsa/ocelot.h
index 435777a0073c..0fe101e8e190 100644
--- a/include/linux/dsa/ocelot.h
+++ b/include/linux/dsa/ocelot.h
@@ -210,9 +210,9 @@ static inline void ocelot_ifh_set_tag_type(void *injection, u64 tag_type)
 	packing(injection, &tag_type, 16, 16, OCELOT_TAG_LEN, PACK, 0);
 }
 
-static inline void ocelot_ifh_set_vid(void *injection, u64 vid)
+static inline void ocelot_ifh_set_vlan_tci(void *injection, u64 vlan_tci)
 {
-	packing(injection, &vid, 11, 0, OCELOT_TAG_LEN, PACK, 0);
+	packing(injection, &vlan_tci, 15, 0, OCELOT_TAG_LEN, PACK, 0);
 }
 
 #endif
-- 
cgit v1.2.3


From f3956ebb3bf06ab2266ad5ee2214aed46405810c Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 1 Oct 2021 14:32:23 -0700
Subject: ethernet: use eth_hw_addr_set() instead of ether_addr_copy()

Convert Ethernet from ether_addr_copy() to eth_hw_addr_set():

  @@
  expression dev, np;
  @@
  - ether_addr_copy(dev->dev_addr, np)
  + eth_hw_addr_set(dev, np)

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/etherdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 928c411bd509..e7b2e5fd8d24 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -323,7 +323,7 @@ static inline void eth_hw_addr_inherit(struct net_device *dst,
 				       struct net_device *src)
 {
 	dst->addr_assign_type = src->addr_assign_type;
-	ether_addr_copy(dst->dev_addr, src->dev_addr);
+	eth_hw_addr_set(dst, src->dev_addr);
 }
 
 /**
-- 
cgit v1.2.3


From d0c27c9211fef3ce8083cc3ad2ca3067d211edc9 Mon Sep 17 00:00:00 2001
From: Henrik Grimler <henrik@grimler.se>
Date: Wed, 29 Sep 2021 20:14:18 +0200
Subject: power: supply: max17042_battery: fix typo in MAX17042_IAvg_empty

Datasheet gives the name IAvg_empty, not LAvg_empty.

Signed-off-by: Henrik Grimler <henrik@grimler.se>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 include/linux/power/max17042_battery.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/power/max17042_battery.h b/include/linux/power/max17042_battery.h
index dd24756a8af7..c417abd2ab70 100644
--- a/include/linux/power/max17042_battery.h
+++ b/include/linux/power/max17042_battery.h
@@ -78,7 +78,7 @@ enum max17042_register {
 	MAX17042_T_empty	= 0x34,
 
 	MAX17042_FullCAP0       = 0x35,
-	MAX17042_LAvg_empty	= 0x36,
+	MAX17042_IAvg_empty	= 0x36,
 	MAX17042_FCTC		= 0x37,
 	MAX17042_RCOMP0		= 0x38,
 	MAX17042_TempCo		= 0x39,
@@ -221,7 +221,7 @@ struct max17042_config_data {
 	u16	fullcap;	/* 0x10 */
 	u16	fullcapnom;	/* 0x23 */
 	u16	socempty;	/* 0x33 */
-	u16	lavg_empty;	/* 0x36 */
+	u16	iavg_empty;	/* 0x36 */
 	u16	dqacc;		/* 0x45 */
 	u16	dpacc;		/* 0x46 */
 	u16	qrtbl00;	/* 0x12 */
-- 
cgit v1.2.3


From dae9a6cab8009e526570e7477ce858dcdfeb256e Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 30 Sep 2021 17:06:21 -0400
Subject: NFSD: Have legacy NFSD WRITE decoders use xdr_stream_subsegment()

Refactor.

Now that the NFSv2 and NFSv3 XDR decoders have been converted to
use xdr_streams, the WRITE decoder functions can use
xdr_stream_subsegment() to extract the WRITE payload into its own
xdr_buf, just as the NFSv4 WRITE XDR decoder currently does.

That makes it possible to pass the first kvec, pages array + length,
page_base, and total payload length via a single function parameter.

The payload's page_base is not yet assigned or used, but will be in
subsequent patches.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/svc.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 064c96157d1f..6263410c948a 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -532,8 +532,7 @@ int		   svc_encode_result_payload(struct svc_rqst *rqstp,
 					     unsigned int offset,
 					     unsigned int length);
 unsigned int	   svc_fill_write_vector(struct svc_rqst *rqstp,
-					 struct page **pages,
-					 struct kvec *first, size_t total);
+					 struct xdr_buf *payload);
 char		  *svc_fill_symlink_pathname(struct svc_rqst *rqstp,
 					     struct kvec *first, void *p,
 					     size_t total);
-- 
cgit v1.2.3


From ca05cbae2a0468e5d78e9b4605936a8bf5da328b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sat, 10 Jul 2021 18:07:14 -0400
Subject: NFS: Fix up nfs_ctx_key_to_expire()

If the cached credential exists but doesn't have any expiration callback
then exit early.
Fix up atomicity issues when replacing the credential with a new one
since the existing code could lead to refcount leaks.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/nfs_fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index b9a8b925db43..9b75448ce0df 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -81,7 +81,7 @@ struct nfs_open_context {
 	fl_owner_t flock_owner;
 	struct dentry *dentry;
 	const struct cred *cred;
-	struct rpc_cred *ll_cred;	/* low-level cred - use to check for expiry */
+	struct rpc_cred __rcu *ll_cred;	/* low-level cred - use to check for expiry */
 	struct nfs4_state *state;
 	fmode_t mode;
 
-- 
cgit v1.2.3


From b9f8713f42af11ae6d7f63075334ba5298436be6 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Mon, 12 Jul 2021 12:24:15 -0400
Subject: SUNRPC: Remove unnecessary memory barriers

The only check for RPC_TASK_RUNNING is the one in rpc_make_runnable(),
which happens under the same spin lock held when we call
rpc_clear_running().

Ditto, the last check for RPC_TASK_QUEUED in rpc_execute() is performed
under the same lock as the one held when we call rpc_clear_queued().

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/sched.h | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index a237b8dbf608..db964bb63912 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -150,25 +150,13 @@ struct rpc_task_setup {
 #define RPC_TASK_MSG_PIN_WAIT	5
 #define RPC_TASK_SIGNALLED	6
 
-#define RPC_IS_RUNNING(t)	test_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)
-#define rpc_set_running(t)	set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)
 #define rpc_test_and_set_running(t) \
 				test_and_set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)
-#define rpc_clear_running(t)	\
-	do { \
-		smp_mb__before_atomic(); \
-		clear_bit(RPC_TASK_RUNNING, &(t)->tk_runstate); \
-		smp_mb__after_atomic(); \
-	} while (0)
+#define rpc_clear_running(t)	clear_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)
 
 #define RPC_IS_QUEUED(t)	test_bit(RPC_TASK_QUEUED, &(t)->tk_runstate)
 #define rpc_set_queued(t)	set_bit(RPC_TASK_QUEUED, &(t)->tk_runstate)
-#define rpc_clear_queued(t)	\
-	do { \
-		smp_mb__before_atomic(); \
-		clear_bit(RPC_TASK_QUEUED, &(t)->tk_runstate); \
-		smp_mb__after_atomic(); \
-	} while (0)
+#define rpc_clear_queued(t)	clear_bit(RPC_TASK_QUEUED, &(t)->tk_runstate)
 
 #define RPC_IS_ACTIVATED(t)	test_bit(RPC_TASK_ACTIVE, &(t)->tk_runstate)
 
-- 
cgit v1.2.3


From ff81dfb5d721fff87bd516c558847f6effb70031 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 28 Sep 2021 14:33:44 -0400
Subject: NFS: Further optimisations for 'ls -l'

If a user is doing 'ls -l', we have a heuristic in GETATTR that tells
the readdir code to try to use READDIRPLUS in order to refresh the inode
attributes. In certain cirumstances, we also try to invalidate the
remaining directory entries in order to ensure this refresh.

If there are multiple readers of the directory, we probably should avoid
invalidating the page cache, since the heuristic breaks down in that
situation anyway.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Tested-by: Benjamin Coddington <bcodding@redhat.com>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
---
 include/linux/nfs_fs.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 9b75448ce0df..ca547cc5458c 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -103,6 +103,7 @@ struct nfs_open_dir_context {
 	__be32	verf[NFS_DIR_VERIFIER_SIZE];
 	__u64 dir_cookie;
 	__u64 dup_cookie;
+	pgoff_t page_index;
 	signed char duped;
 };
 
@@ -181,9 +182,6 @@ struct nfs_inode {
 	struct rw_semaphore	rmdir_sem;
 	struct mutex		commit_mutex;
 
-	/* track last access to cached pages */
-	unsigned long		page_index;
-
 #if IS_ENABLED(CONFIG_NFS_V4)
 	struct nfs4_cached_acl	*nfs4_acl;
         /* NFSv4 state */
@@ -272,6 +270,7 @@ struct nfs4_copy_state {
 #define NFS_INO_INVALIDATING	(3)		/* inode is being invalidated */
 #define NFS_INO_FSCACHE		(5)		/* inode can be cached by FS-Cache */
 #define NFS_INO_FSCACHE_LOCK	(6)		/* FS-Cache cookie management lock */
+#define NFS_INO_FORCE_READDIR	(7)		/* force readdirplus */
 #define NFS_INO_LAYOUTCOMMIT	(9)		/* layoutcommit required */
 #define NFS_INO_LAYOUTCOMMITTING (10)		/* layoutcommit inflight */
 #define NFS_INO_LAYOUTSTATS	(11)		/* layoutstats inflight */
-- 
cgit v1.2.3


From 3734b9f2cee01d9dde2fbbda742ba6dd6ba10a29 Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <digetx@gmail.com>
Date: Mon, 27 Sep 2021 01:40:24 +0300
Subject: opp: Change type of dev_pm_opp_attach_genpd(names) argument

Elements of the 'names' array are not changed by the code, constify them
for consistency.

Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 include/linux/pm_opp.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 84150a22fd7c..7f1f066fc377 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -156,9 +156,9 @@ int devm_pm_opp_set_clkname(struct device *dev, const char *name);
 struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data));
 void dev_pm_opp_unregister_set_opp_helper(struct opp_table *opp_table);
 int devm_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data));
-struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char **names, struct device ***virt_devs);
+struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char * const *names, struct device ***virt_devs);
 void dev_pm_opp_detach_genpd(struct opp_table *opp_table);
-int devm_pm_opp_attach_genpd(struct device *dev, const char **names, struct device ***virt_devs);
+int devm_pm_opp_attach_genpd(struct device *dev, const char * const *names, struct device ***virt_devs);
 struct dev_pm_opp *dev_pm_opp_xlate_required_opp(struct opp_table *src_table, struct opp_table *dst_table, struct dev_pm_opp *src_opp);
 int dev_pm_opp_xlate_performance_state(struct opp_table *src_table, struct opp_table *dst_table, unsigned int pstate);
 int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq);
@@ -376,7 +376,7 @@ static inline int devm_pm_opp_set_clkname(struct device *dev, const char *name)
 	return -EOPNOTSUPP;
 }
 
-static inline struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char **names, struct device ***virt_devs)
+static inline struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char * const *names, struct device ***virt_devs)
 {
 	return ERR_PTR(-EOPNOTSUPP);
 }
@@ -384,7 +384,7 @@ static inline struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, cons
 static inline void dev_pm_opp_detach_genpd(struct opp_table *opp_table) {}
 
 static inline int devm_pm_opp_attach_genpd(struct device *dev,
-					   const char **names,
+					   const char * const *names,
 					   struct device ***virt_devs)
 {
 	return -EOPNOTSUPP;
-- 
cgit v1.2.3


From 46b49b12f3fc5e1347dba37d4639e2165f447871 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Wed, 8 Sep 2021 17:58:33 -0500
Subject: arch/cc: Introduce a function to check for confidential computing
 features

In preparation for other confidential computing technologies, introduce
a generic helper function, cc_platform_has(), that can be used to
check for specific active confidential computing attributes, like
memory encryption. This is intended to eliminate having to add multiple
technology-specific checks to the code (e.g. if (sev_active() ||
tdx_active() || ... ).

 [ bp: s/_CC_PLATFORM_H/_LINUX_CC_PLATFORM_H/g ]

Co-developed-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Co-developed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lkml.kernel.org/r/20210928191009.32551-3-bp@alien8.de
---
 include/linux/cc_platform.h | 88 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 88 insertions(+)
 create mode 100644 include/linux/cc_platform.h

(limited to 'include/linux')

diff --git a/include/linux/cc_platform.h b/include/linux/cc_platform.h
new file mode 100644
index 000000000000..a075b70b9a70
--- /dev/null
+++ b/include/linux/cc_platform.h
@@ -0,0 +1,88 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Confidential Computing Platform Capability checks
+ *
+ * Copyright (C) 2021 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ */
+
+#ifndef _LINUX_CC_PLATFORM_H
+#define _LINUX_CC_PLATFORM_H
+
+#include <linux/types.h>
+#include <linux/stddef.h>
+
+/**
+ * enum cc_attr - Confidential computing attributes
+ *
+ * These attributes represent confidential computing features that are
+ * currently active.
+ */
+enum cc_attr {
+	/**
+	 * @CC_ATTR_MEM_ENCRYPT: Memory encryption is active
+	 *
+	 * The platform/OS is running with active memory encryption. This
+	 * includes running either as a bare-metal system or a hypervisor
+	 * and actively using memory encryption or as a guest/virtual machine
+	 * and actively using memory encryption.
+	 *
+	 * Examples include SME, SEV and SEV-ES.
+	 */
+	CC_ATTR_MEM_ENCRYPT,
+
+	/**
+	 * @CC_ATTR_HOST_MEM_ENCRYPT: Host memory encryption is active
+	 *
+	 * The platform/OS is running as a bare-metal system or a hypervisor
+	 * and actively using memory encryption.
+	 *
+	 * Examples include SME.
+	 */
+	CC_ATTR_HOST_MEM_ENCRYPT,
+
+	/**
+	 * @CC_ATTR_GUEST_MEM_ENCRYPT: Guest memory encryption is active
+	 *
+	 * The platform/OS is running as a guest/virtual machine and actively
+	 * using memory encryption.
+	 *
+	 * Examples include SEV and SEV-ES.
+	 */
+	CC_ATTR_GUEST_MEM_ENCRYPT,
+
+	/**
+	 * @CC_ATTR_GUEST_STATE_ENCRYPT: Guest state encryption is active
+	 *
+	 * The platform/OS is running as a guest/virtual machine and actively
+	 * using memory encryption and register state encryption.
+	 *
+	 * Examples include SEV-ES.
+	 */
+	CC_ATTR_GUEST_STATE_ENCRYPT,
+};
+
+#ifdef CONFIG_ARCH_HAS_CC_PLATFORM
+
+/**
+ * cc_platform_has() - Checks if the specified cc_attr attribute is active
+ * @attr: Confidential computing attribute to check
+ *
+ * The cc_platform_has() function will return an indicator as to whether the
+ * specified Confidential Computing attribute is currently active.
+ *
+ * Context: Any context
+ * Return:
+ * * TRUE  - Specified Confidential Computing attribute is active
+ * * FALSE - Specified Confidential Computing attribute is not active
+ */
+bool cc_platform_has(enum cc_attr attr);
+
+#else	/* !CONFIG_ARCH_HAS_CC_PLATFORM */
+
+static inline bool cc_platform_has(enum cc_attr attr) { return false; }
+
+#endif	/* CONFIG_ARCH_HAS_CC_PLATFORM */
+
+#endif	/* _LINUX_CC_PLATFORM_H */
-- 
cgit v1.2.3


From e9d1d2bb75b2d5d4b426769c5aae0ce8cef3558f Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Wed, 8 Sep 2021 17:58:39 -0500
Subject: treewide: Replace the use of mem_encrypt_active() with
 cc_platform_has()

Replace uses of mem_encrypt_active() with calls to cc_platform_has() with
the CC_ATTR_MEM_ENCRYPT attribute.

Remove the implementation of mem_encrypt_active() across all arches.

For s390, since the default implementation of the cc_platform_has()
matches the s390 implementation of mem_encrypt_active(), cc_platform_has()
does not need to be implemented in s390 (the config option
ARCH_HAS_CC_PLATFORM is not set).

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lkml.kernel.org/r/20210928191009.32551-9-bp@alien8.de
---
 include/linux/mem_encrypt.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h
index 5c4a18a91f89..ae4526389261 100644
--- a/include/linux/mem_encrypt.h
+++ b/include/linux/mem_encrypt.h
@@ -16,10 +16,6 @@
 
 #include <asm/mem_encrypt.h>
 
-#else	/* !CONFIG_ARCH_HAS_MEM_ENCRYPT */
-
-static inline bool mem_encrypt_active(void) { return false; }
-
 #endif	/* CONFIG_ARCH_HAS_MEM_ENCRYPT */
 
 #ifdef CONFIG_AMD_MEM_ENCRYPT
-- 
cgit v1.2.3


From e69709f6861aaba80b4eaab6825e8c522a2afb5c Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <digetx@gmail.com>
Date: Mon, 20 Sep 2021 20:22:46 +0300
Subject: opp: Add more resource-managed variants of dev_pm_opp_of_add_table()

Add resource-managed variants of dev_pm_opp_of_add_table_indexed() and
dev_pm_opp_of_add_table_noclk(), allowing drivers to remove boilerplate
code.

Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
[ Viresh: Added underscore to devm_of_add_table_indexed() ]
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 include/linux/pm_opp.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 7f1f066fc377..879c138c7b8e 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -439,7 +439,9 @@ static inline int dev_pm_opp_sync_regulators(struct device *dev)
 #if defined(CONFIG_PM_OPP) && defined(CONFIG_OF)
 int dev_pm_opp_of_add_table(struct device *dev);
 int dev_pm_opp_of_add_table_indexed(struct device *dev, int index);
+int devm_pm_opp_of_add_table_indexed(struct device *dev, int index);
 int dev_pm_opp_of_add_table_noclk(struct device *dev, int index);
+int devm_pm_opp_of_add_table_noclk(struct device *dev, int index);
 void dev_pm_opp_of_remove_table(struct device *dev);
 int devm_pm_opp_of_add_table(struct device *dev);
 int dev_pm_opp_of_cpumask_add_table(const struct cpumask *cpumask);
@@ -465,11 +467,21 @@ static inline int dev_pm_opp_of_add_table_indexed(struct device *dev, int index)
 	return -EOPNOTSUPP;
 }
 
+static inline int devm_pm_opp_of_add_table_indexed(struct device *dev, int index)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline int dev_pm_opp_of_add_table_noclk(struct device *dev, int index)
 {
 	return -EOPNOTSUPP;
 }
 
+static inline int devm_pm_opp_of_add_table_noclk(struct device *dev, int index)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline void dev_pm_opp_of_remove_table(struct device *dev)
 {
 }
-- 
cgit v1.2.3


From 19198e4ec97dc9d173b458a75ace3c3ca55c2d85 Mon Sep 17 00:00:00 2001
From: Prabhakar Kushwaha <pkushwaha@marvell.com>
Date: Mon, 4 Oct 2021 09:58:39 +0300
Subject: qed: Fix kernel-doc warnings

This patch fixes all the qed and qede kernel-doc warnings
according to the guidelines that are described in
Documentation/doc-guide/kernel-doc.rst.

Signed-off-by: Ariel Elior <aelior@marvell.com>
Signed-off-by: Omkar Kulkarni <okulkarni@marvell.com>
Signed-off-by: Shai Malin <smalin@marvell.com>
Signed-off-by: Prabhakar Kushwaha <pkushwaha@marvell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/qed/qed_chain.h      |  97 +++++++-------
 include/linux/qed/qed_if.h         | 255 ++++++++++++++++++++-----------------
 include/linux/qed/qed_iscsi_if.h   |   2 +-
 include/linux/qed/qed_ll2_if.h     |  42 +++---
 include/linux/qed/qed_nvmetcp_if.h |  17 +++
 5 files changed, 230 insertions(+), 183 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/qed/qed_chain.h b/include/linux/qed/qed_chain.h
index f34dbd0db795..a84063492c71 100644
--- a/include/linux/qed/qed_chain.h
+++ b/include/linux/qed/qed_chain.h
@@ -268,14 +268,15 @@ static inline dma_addr_t qed_chain_get_pbl_phys(const struct qed_chain *chain)
 }
 
 /**
- * @brief qed_chain_advance_page -
+ * qed_chain_advance_page(): Advance the next element across pages for a
+ *                           linked chain.
  *
- * Advance the next element across pages for a linked chain
+ * @p_chain: P_chain.
+ * @p_next_elem: P_next_elem.
+ * @idx_to_inc: Idx_to_inc.
+ * @page_to_inc: page_to_inc.
  *
- * @param p_chain
- * @param p_next_elem
- * @param idx_to_inc
- * @param page_to_inc
+ * Return: Void.
  */
 static inline void
 qed_chain_advance_page(struct qed_chain *p_chain,
@@ -336,12 +337,14 @@ qed_chain_advance_page(struct qed_chain *p_chain,
 	} while (0)
 
 /**
- * @brief qed_chain_return_produced -
+ * qed_chain_return_produced(): A chain in which the driver "Produces"
+ *                              elements should use this API
+ *                              to indicate previous produced elements
+ *                              are now consumed.
  *
- * A chain in which the driver "Produces" elements should use this API
- * to indicate previous produced elements are now consumed.
+ * @p_chain: Chain.
  *
- * @param p_chain
+ * Return: Void.
  */
 static inline void qed_chain_return_produced(struct qed_chain *p_chain)
 {
@@ -353,15 +356,15 @@ static inline void qed_chain_return_produced(struct qed_chain *p_chain)
 }
 
 /**
- * @brief qed_chain_produce -
+ * qed_chain_produce(): A chain in which the driver "Produces"
+ *                      elements should use this to get a pointer to
+ *                      the next element which can be "Produced". It's driver
+ *                      responsibility to validate that the chain has room for
+ *                      new element.
  *
- * A chain in which the driver "Produces" elements should use this to get
- * a pointer to the next element which can be "Produced". It's driver
- * responsibility to validate that the chain has room for new element.
+ * @p_chain: Chain.
  *
- * @param p_chain
- *
- * @return void*, a pointer to next element
+ * Return: void*, a pointer to next element.
  */
 static inline void *qed_chain_produce(struct qed_chain *p_chain)
 {
@@ -395,14 +398,11 @@ static inline void *qed_chain_produce(struct qed_chain *p_chain)
 }
 
 /**
- * @brief qed_chain_get_capacity -
- *
- * Get the maximum number of BDs in chain
+ * qed_chain_get_capacity(): Get the maximum number of BDs in chain
  *
- * @param p_chain
- * @param num
+ * @p_chain: Chain.
  *
- * @return number of unusable BDs
+ * Return: number of unusable BDs.
  */
 static inline u32 qed_chain_get_capacity(struct qed_chain *p_chain)
 {
@@ -410,12 +410,14 @@ static inline u32 qed_chain_get_capacity(struct qed_chain *p_chain)
 }
 
 /**
- * @brief qed_chain_recycle_consumed -
+ * qed_chain_recycle_consumed(): Returns an element which was
+ *                               previously consumed;
+ *                               Increments producers so they could
+ *                               be written to FW.
  *
- * Returns an element which was previously consumed;
- * Increments producers so they could be written to FW.
+ * @p_chain: Chain.
  *
- * @param p_chain
+ * Return: Void.
  */
 static inline void qed_chain_recycle_consumed(struct qed_chain *p_chain)
 {
@@ -427,14 +429,13 @@ static inline void qed_chain_recycle_consumed(struct qed_chain *p_chain)
 }
 
 /**
- * @brief qed_chain_consume -
+ * qed_chain_consume(): A Chain in which the driver utilizes data written
+ *                      by a different source (i.e., FW) should use this to
+ *                      access passed buffers.
  *
- * A Chain in which the driver utilizes data written by a different source
- * (i.e., FW) should use this to access passed buffers.
+ * @p_chain: Chain.
  *
- * @param p_chain
- *
- * @return void*, a pointer to the next buffer written
+ * Return: void*, a pointer to the next buffer written.
  */
 static inline void *qed_chain_consume(struct qed_chain *p_chain)
 {
@@ -468,9 +469,11 @@ static inline void *qed_chain_consume(struct qed_chain *p_chain)
 }
 
 /**
- * @brief qed_chain_reset - Resets the chain to its start state
+ * qed_chain_reset(): Resets the chain to its start state.
+ *
+ * @p_chain: pointer to a previously allocated chain.
  *
- * @param p_chain pointer to a previously allocated chain
+ * Return Void.
  */
 static inline void qed_chain_reset(struct qed_chain *p_chain)
 {
@@ -519,13 +522,12 @@ static inline void qed_chain_reset(struct qed_chain *p_chain)
 }
 
 /**
- * @brief qed_chain_get_last_elem -
+ * qed_chain_get_last_elem(): Returns a pointer to the last element of the
+ *                            chain.
  *
- * Returns a pointer to the last element of the chain
+ * @p_chain: Chain.
  *
- * @param p_chain
- *
- * @return void*
+ * Return: void*.
  */
 static inline void *qed_chain_get_last_elem(struct qed_chain *p_chain)
 {
@@ -563,10 +565,13 @@ out:
 }
 
 /**
- * @brief qed_chain_set_prod - sets the prod to the given value
+ * qed_chain_set_prod(): sets the prod to the given value.
+ *
+ * @p_chain: Chain.
+ * @prod_idx: Prod Idx.
+ * @p_prod_elem: Prod elem.
  *
- * @param prod_idx
- * @param p_prod_elem
+ * Return Void.
  */
 static inline void qed_chain_set_prod(struct qed_chain *p_chain,
 				      u32 prod_idx, void *p_prod_elem)
@@ -610,9 +615,11 @@ static inline void qed_chain_set_prod(struct qed_chain *p_chain,
 }
 
 /**
- * @brief qed_chain_pbl_zero_mem - set chain memory to 0
+ * qed_chain_pbl_zero_mem(): set chain memory to 0.
+ *
+ * @p_chain: Chain.
  *
- * @param p_chain
+ * Return: Void.
  */
 static inline void qed_chain_pbl_zero_mem(struct qed_chain *p_chain)
 {
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index 850b98991670..f39451aaaeec 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -819,47 +819,47 @@ struct qed_common_cb_ops {
 
 struct qed_selftest_ops {
 /**
- * @brief selftest_interrupt - Perform interrupt test
+ * selftest_interrupt(): Perform interrupt test.
  *
- * @param cdev
+ * @cdev: Qed dev pointer.
  *
- * @return 0 on success, error otherwise.
+ * Return: 0 on success, error otherwise.
  */
 	int (*selftest_interrupt)(struct qed_dev *cdev);
 
 /**
- * @brief selftest_memory - Perform memory test
+ * selftest_memory(): Perform memory test.
  *
- * @param cdev
+ * @cdev: Qed dev pointer.
  *
- * @return 0 on success, error otherwise.
+ * Return: 0 on success, error otherwise.
  */
 	int (*selftest_memory)(struct qed_dev *cdev);
 
 /**
- * @brief selftest_register - Perform register test
+ * selftest_register(): Perform register test.
  *
- * @param cdev
+ * @cdev: Qed dev pointer.
  *
- * @return 0 on success, error otherwise.
+ * Return: 0 on success, error otherwise.
  */
 	int (*selftest_register)(struct qed_dev *cdev);
 
 /**
- * @brief selftest_clock - Perform clock test
+ * selftest_clock(): Perform clock test.
  *
- * @param cdev
+ * @cdev: Qed dev pointer.
  *
- * @return 0 on success, error otherwise.
+ * Return: 0 on success, error otherwise.
  */
 	int (*selftest_clock)(struct qed_dev *cdev);
 
 /**
- * @brief selftest_nvram - Perform nvram test
+ * selftest_nvram(): Perform nvram test.
  *
- * @param cdev
+ * @cdev: Qed dev pointer.
  *
- * @return 0 on success, error otherwise.
+ * Return: 0 on success, error otherwise.
  */
 	int (*selftest_nvram) (struct qed_dev *cdev);
 };
@@ -927,47 +927,53 @@ struct qed_common_ops {
 				  enum qed_hw_err_type err_type);
 
 /**
- * @brief can_link_change - can the instance change the link or not
+ * can_link_change(): can the instance change the link or not.
  *
- * @param cdev
+ * @cdev: Qed dev pointer.
  *
- * @return true if link-change is allowed, false otherwise.
+ * Return: true if link-change is allowed, false otherwise.
  */
 	bool (*can_link_change)(struct qed_dev *cdev);
 
 /**
- * @brief set_link - set links according to params
+ * set_link(): set links according to params.
  *
- * @param cdev
- * @param params - values used to override the default link configuration
+ * @cdev: Qed dev pointer.
+ * @params: values used to override the default link configuration.
  *
- * @return 0 on success, error otherwise.
+ * Return: 0 on success, error otherwise.
  */
 	int		(*set_link)(struct qed_dev *cdev,
 				    struct qed_link_params *params);
 
 /**
- * @brief get_link - returns the current link state.
+ * get_link(): returns the current link state.
  *
- * @param cdev
- * @param if_link - structure to be filled with current link configuration.
+ * @cdev: Qed dev pointer.
+ * @if_link: structure to be filled with current link configuration.
+ *
+ * Return: Void.
  */
 	void		(*get_link)(struct qed_dev *cdev,
 				    struct qed_link_output *if_link);
 
 /**
- * @brief - drains chip in case Tx completions fail to arrive due to pause.
+ * drain(): drains chip in case Tx completions fail to arrive due to pause.
+ *
+ * @cdev: Qed dev pointer.
  *
- * @param cdev
+ * Return: Int.
  */
 	int		(*drain)(struct qed_dev *cdev);
 
 /**
- * @brief update_msglvl - update module debug level
+ * update_msglvl(): update module debug level.
  *
- * @param cdev
- * @param dp_module
- * @param dp_level
+ * @cdev: Qed dev pointer.
+ * @dp_module: Debug module.
+ * @dp_level: Debug level.
+ *
+ * Return: Void.
  */
 	void		(*update_msglvl)(struct qed_dev *cdev,
 					 u32 dp_module,
@@ -981,70 +987,73 @@ struct qed_common_ops {
 				      struct qed_chain *p_chain);
 
 /**
- * @brief nvm_flash - Flash nvm data.
+ * nvm_flash(): Flash nvm data.
  *
- * @param cdev
- * @param name - file containing the data
+ * @cdev: Qed dev pointer.
+ * @name: file containing the data.
  *
- * @return 0 on success, error otherwise.
+ * Return: 0 on success, error otherwise.
  */
 	int (*nvm_flash)(struct qed_dev *cdev, const char *name);
 
 /**
- * @brief nvm_get_image - reads an entire image from nvram
+ * nvm_get_image(): reads an entire image from nvram.
  *
- * @param cdev
- * @param type - type of the request nvram image
- * @param buf - preallocated buffer to fill with the image
- * @param len - length of the allocated buffer
+ * @cdev: Qed dev pointer.
+ * @type: type of the request nvram image.
+ * @buf: preallocated buffer to fill with the image.
+ * @len: length of the allocated buffer.
  *
- * @return 0 on success, error otherwise
+ * Return: 0 on success, error otherwise.
  */
 	int (*nvm_get_image)(struct qed_dev *cdev,
 			     enum qed_nvm_images type, u8 *buf, u16 len);
 
 /**
- * @brief set_coalesce - Configure Rx coalesce value in usec
+ * set_coalesce(): Configure Rx coalesce value in usec.
  *
- * @param cdev
- * @param rx_coal - Rx coalesce value in usec
- * @param tx_coal - Tx coalesce value in usec
- * @param qid - Queue index
- * @param sb_id - Status Block Id
+ * @cdev: Qed dev pointer.
+ * @rx_coal: Rx coalesce value in usec.
+ * @tx_coal: Tx coalesce value in usec.
+ * @handle: Handle.
  *
- * @return 0 on success, error otherwise.
+ * Return: 0 on success, error otherwise.
  */
 	int (*set_coalesce)(struct qed_dev *cdev,
 			    u16 rx_coal, u16 tx_coal, void *handle);
 
 /**
- * @brief set_led - Configure LED mode
+ * set_led() - Configure LED mode.
  *
- * @param cdev
- * @param mode - LED mode
+ * @cdev: Qed dev pointer.
+ * @mode: LED mode.
  *
- * @return 0 on success, error otherwise.
+ * Return: 0 on success, error otherwise.
  */
 	int (*set_led)(struct qed_dev *cdev,
 		       enum qed_led_mode mode);
 
 /**
- * @brief attn_clr_enable - Prevent attentions from being reasserted
+ * attn_clr_enable(): Prevent attentions from being reasserted.
+ *
+ * @cdev: Qed dev pointer.
+ * @clr_enable: Clear enable.
  *
- * @param cdev
- * @param clr_enable
+ * Return: Void.
  */
 	void (*attn_clr_enable)(struct qed_dev *cdev, bool clr_enable);
 
 /**
- * @brief db_recovery_add - add doorbell information to the doorbell
- * recovery mechanism.
+ * db_recovery_add(): add doorbell information to the doorbell
+ *                    recovery mechanism.
  *
- * @param cdev
- * @param db_addr - doorbell address
- * @param db_data - address of where db_data is stored
- * @param db_is_32b - doorbell is 32b pr 64b
- * @param db_is_user - doorbell recovery addresses are user or kernel space
+ * @cdev: Qed dev pointer.
+ * @db_addr: Doorbell address.
+ * @db_data: Dddress of where db_data is stored.
+ * @db_width: Doorbell is 32b or 64b.
+ * @db_space: Doorbell recovery addresses are user or kernel space.
+ *
+ * Return: Int.
  */
 	int (*db_recovery_add)(struct qed_dev *cdev,
 			       void __iomem *db_addr,
@@ -1053,114 +1062,130 @@ struct qed_common_ops {
 			       enum qed_db_rec_space db_space);
 
 /**
- * @brief db_recovery_del - remove doorbell information from the doorbell
+ * db_recovery_del(): remove doorbell information from the doorbell
  * recovery mechanism. db_data serves as key (db_addr is not unique).
  *
- * @param cdev
- * @param db_addr - doorbell address
- * @param db_data - address where db_data is stored. Serves as key for the
- *		    entry to delete.
+ * @cdev: Qed dev pointer.
+ * @db_addr: Doorbell address.
+ * @db_data: Address where db_data is stored. Serves as key for the
+ *           entry to delete.
+ *
+ * Return: Int.
  */
 	int (*db_recovery_del)(struct qed_dev *cdev,
 			       void __iomem *db_addr, void *db_data);
 
 /**
- * @brief recovery_process - Trigger a recovery process
+ * recovery_process(): Trigger a recovery process.
  *
- * @param cdev
+ * @cdev: Qed dev pointer.
  *
- * @return 0 on success, error otherwise.
+ * Return: 0 on success, error otherwise.
  */
 	int (*recovery_process)(struct qed_dev *cdev);
 
 /**
- * @brief recovery_prolog - Execute the prolog operations of a recovery process
+ * recovery_prolog(): Execute the prolog operations of a recovery process.
  *
- * @param cdev
+ * @cdev: Qed dev pointer.
  *
- * @return 0 on success, error otherwise.
+ * Return: 0 on success, error otherwise.
  */
 	int (*recovery_prolog)(struct qed_dev *cdev);
 
 /**
- * @brief update_drv_state - API to inform the change in the driver state.
+ * update_drv_state(): API to inform the change in the driver state.
  *
- * @param cdev
- * @param active
+ * @cdev: Qed dev pointer.
+ * @active: Active
  *
+ * Return: Int.
  */
 	int (*update_drv_state)(struct qed_dev *cdev, bool active);
 
 /**
- * @brief update_mac - API to inform the change in the mac address
+ * update_mac(): API to inform the change in the mac address.
  *
- * @param cdev
- * @param mac
+ * @cdev: Qed dev pointer.
+ * @mac: MAC.
  *
+ * Return: Int.
  */
 	int (*update_mac)(struct qed_dev *cdev, u8 *mac);
 
 /**
- * @brief update_mtu - API to inform the change in the mtu
+ * update_mtu(): API to inform the change in the mtu.
  *
- * @param cdev
- * @param mtu
+ * @cdev: Qed dev pointer.
+ * @mtu: MTU.
  *
+ * Return: Int.
  */
 	int (*update_mtu)(struct qed_dev *cdev, u16 mtu);
 
 /**
- * @brief update_wol - update of changes in the WoL configuration
+ * update_wol(): Update of changes in the WoL configuration.
+ *
+ * @cdev: Qed dev pointer.
+ * @enabled: true iff WoL should be enabled.
  *
- * @param cdev
- * @param enabled - true iff WoL should be enabled.
+ * Return: Int.
  */
 	int (*update_wol) (struct qed_dev *cdev, bool enabled);
 
 /**
- * @brief read_module_eeprom
+ * read_module_eeprom(): Read EEPROM.
  *
- * @param cdev
- * @param buf - buffer
- * @param dev_addr - PHY device memory region
- * @param offset - offset into eeprom contents to be read
- * @param len - buffer length, i.e., max bytes to be read
+ * @cdev: Qed dev pointer.
+ * @buf: buffer.
+ * @dev_addr: PHY device memory region.
+ * @offset: offset into eeprom contents to be read.
+ * @len: buffer length, i.e., max bytes to be read.
+ *
+ * Return: Int.
  */
 	int (*read_module_eeprom)(struct qed_dev *cdev,
 				  char *buf, u8 dev_addr, u32 offset, u32 len);
 
 /**
- * @brief get_affin_hwfn_idx
+ * get_affin_hwfn_idx(): Get affine HW function.
+ *
+ * @cdev: Qed dev pointer.
  *
- * @param cdev
+ * Return: u8.
  */
 	u8 (*get_affin_hwfn_idx)(struct qed_dev *cdev);
 
 /**
- * @brief read_nvm_cfg - Read NVM config attribute value.
- * @param cdev
- * @param buf - buffer
- * @param cmd - NVM CFG command id
- * @param entity_id - Entity id
+ * read_nvm_cfg(): Read NVM config attribute value.
+ *
+ * @cdev: Qed dev pointer.
+ * @buf: Buffer.
+ * @cmd: NVM CFG command id.
+ * @entity_id: Entity id.
  *
+ * Return: Int.
  */
 	int (*read_nvm_cfg)(struct qed_dev *cdev, u8 **buf, u32 cmd,
 			    u32 entity_id);
 /**
- * @brief read_nvm_cfg - Read NVM config attribute value.
- * @param cdev
- * @param cmd - NVM CFG command id
+ * read_nvm_cfg_len(): Read NVM config attribute value.
  *
- * @return config id length, 0 on error.
+ * @cdev: Qed dev pointer.
+ * @cmd: NVM CFG command id.
+ *
+ * Return: config id length, 0 on error.
  */
 	int (*read_nvm_cfg_len)(struct qed_dev *cdev, u32 cmd);
 
 /**
- * @brief set_grc_config - Configure value for grc config id.
- * @param cdev
- * @param cfg_id - grc config id
- * @param val - grc config value
+ * set_grc_config(): Configure value for grc config id.
+ *
+ * @cdev: Qed dev pointer.
+ * @cfg_id: grc config id
+ * @val: grc config value
  *
+ * Return: Int.
  */
 	int (*set_grc_config)(struct qed_dev *cdev, u32 cfg_id, u32 val);
 
@@ -1397,18 +1422,16 @@ static inline u16 qed_sb_update_sb_idx(struct qed_sb_info *sb_info)
 }
 
 /**
+ * qed_sb_ack(): This function creates an update command for interrupts
+ *               that is  written to the IGU.
  *
- * @brief This function creates an update command for interrupts that is
- *        written to the IGU.
- *
- * @param sb_info       - This is the structure allocated and
- *                 initialized per status block. Assumption is
- *                 that it was initialized using qed_sb_init
- * @param int_cmd       - Enable/Disable/Nop
- * @param upd_flg       - whether igu consumer should be
- *                 updated.
+ * @sb_info: This is the structure allocated and
+ *           initialized per status block. Assumption is
+ *           that it was initialized using qed_sb_init
+ * @int_cmd: Enable/Disable/Nop
+ * @upd_flg: Whether igu consumer should be updated.
  *
- * @return inline void
+ * Return: inline void.
  */
 static inline void qed_sb_ack(struct qed_sb_info *sb_info,
 			      enum igu_int_cmd int_cmd,
diff --git a/include/linux/qed/qed_iscsi_if.h b/include/linux/qed/qed_iscsi_if.h
index 04180d9af560..494cdc3cd840 100644
--- a/include/linux/qed/qed_iscsi_if.h
+++ b/include/linux/qed/qed_iscsi_if.h
@@ -182,7 +182,7 @@ struct qed_iscsi_cb_ops {
  *			@param stats - pointer to struck that would be filled
  *				we stats
  *			@return 0 on success, error otherwise.
- * @change_mac		Change MAC of interface
+ * @change_mac:		Change MAC of interface
  *			@param cdev
  *			@param handle - the connection handle.
  *			@param mac - new MAC to configure.
diff --git a/include/linux/qed/qed_ll2_if.h b/include/linux/qed/qed_ll2_if.h
index ff808d248883..5b67cd03276e 100644
--- a/include/linux/qed/qed_ll2_if.h
+++ b/include/linux/qed/qed_ll2_if.h
@@ -208,57 +208,57 @@ enum qed_ll2_xmit_flags {
 
 struct qed_ll2_ops {
 /**
- * @brief start - initializes ll2
+ * start(): Initializes ll2.
  *
- * @param cdev
- * @param params - protocol driver configuration for the ll2.
+ * @cdev: Qed dev pointer.
+ * @params: Protocol driver configuration for the ll2.
  *
- * @return 0 on success, otherwise error value.
+ * Return: 0 on success, otherwise error value.
  */
 	int (*start)(struct qed_dev *cdev, struct qed_ll2_params *params);
 
 /**
- * @brief stop - stops the ll2
+ * stop(): Stops the ll2
  *
- * @param cdev
+ * @cdev: Qed dev pointer.
  *
- * @return 0 on success, otherwise error value.
+ * Return: 0 on success, otherwise error value.
  */
 	int (*stop)(struct qed_dev *cdev);
 
 /**
- * @brief start_xmit - transmits an skb over the ll2 interface
+ * start_xmit(): Transmits an skb over the ll2 interface
  *
- * @param cdev
- * @param skb
- * @param xmit_flags - Transmit options defined by the enum qed_ll2_xmit_flags.
+ * @cdev: Qed dev pointer.
+ * @skb: SKB.
+ * @xmit_flags: Transmit options defined by the enum qed_ll2_xmit_flags.
  *
- * @return 0 on success, otherwise error value.
+ * Return: 0 on success, otherwise error value.
  */
 	int (*start_xmit)(struct qed_dev *cdev, struct sk_buff *skb,
 			  unsigned long xmit_flags);
 
 /**
- * @brief register_cb_ops - protocol driver register the callback for Rx/Tx
+ * register_cb_ops(): Protocol driver register the callback for Rx/Tx
  * packets. Should be called before `start'.
  *
- * @param cdev
- * @param cookie - to be passed to the callback functions.
- * @param ops - the callback functions to register for Rx / Tx.
+ * @cdev: Qed dev pointer.
+ * @cookie: to be passed to the callback functions.
+ * @ops: the callback functions to register for Rx / Tx.
  *
- * @return 0 on success, otherwise error value.
+ * Return: 0 on success, otherwise error value.
  */
 	void (*register_cb_ops)(struct qed_dev *cdev,
 				const struct qed_ll2_cb_ops *ops,
 				void *cookie);
 
 /**
- * @brief get LL2 related statistics
+ * get_stats(): Get LL2 related statistics.
  *
- * @param cdev
- * @param stats - pointer to struct that would be filled with stats
+ * @cdev: Qed dev pointer.
+ * @stats: Pointer to struct that would be filled with stats.
  *
- * @return 0 on success, error otherwise.
+ * Return: 0 on success, error otherwise.
  */
 	int (*get_stats)(struct qed_dev *cdev, struct qed_ll2_stats *stats);
 };
diff --git a/include/linux/qed/qed_nvmetcp_if.h b/include/linux/qed/qed_nvmetcp_if.h
index 14671bc19ed1..1d51df347560 100644
--- a/include/linux/qed/qed_nvmetcp_if.h
+++ b/include/linux/qed/qed_nvmetcp_if.h
@@ -171,6 +171,23 @@ struct nvmetcp_task_params {
  *			@param dest_port
  * @clear_all_filters: Clear all filters.
  *			@param cdev
+ * @init_read_io: Init read IO.
+ *			@task_params
+ *			@cmd_pdu_header
+ *			@nvme_cmd
+ *			@sgl_task_params
+ * @init_write_io: Init write IO.
+ *			@task_params
+ *			@cmd_pdu_header
+ *			@nvme_cmd
+ *			@sgl_task_params
+ * @init_icreq_exchange: Exchange ICReq.
+ *			@task_params
+ *			@init_conn_req_pdu_hdr
+ *			@tx_sgl_task_params
+ *			@rx_sgl_task_params
+ * @init_task_cleanup: Init task cleanup.
+ *			@task_params
  */
 struct qed_nvmetcp_ops {
 	const struct qed_common_ops *common;
-- 
cgit v1.2.3


From fb09a1ed5c6e507499a9da54bfd34f71a2673961 Mon Sep 17 00:00:00 2001
From: Shai Malin <smalin@marvell.com>
Date: Mon, 4 Oct 2021 09:58:40 +0300
Subject: qed: Remove e4_ and _e4 from FW HSI

The existing qed/qede/qedr/qedi/qedf code uses chip-specific naming in
structures,  functions, variables and defines in FW HSI (Hardware
Software Interface).

The new FW version introduced a generic naming convention in HSI
in-which the same code will be used across different versions
for simpler maintainability. It also eases in providing support for
new features.

With this patch every "_e4" or "e4_" prefix or suffix is not needed
anymore and it will be removed.

Reviewed-by: Manish Rangankar <mrangankar@marvell.com>
Reviewed-by: Javed Hasan <jhasan@marvell.com>
Signed-off-by: Ariel Elior <aelior@marvell.com>
Signed-off-by: Omkar Kulkarni <okulkarni@marvell.com>
Signed-off-by: Shai Malin <smalin@marvell.com>
Signed-off-by: Prabhakar Kushwaha <pkushwaha@marvell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/qed/common_hsi.h     |  28 +--
 include/linux/qed/fcoe_common.h    | 362 ++++++++++++++++++-------------------
 include/linux/qed/iscsi_common.h   | 360 ++++++++++++++++++------------------
 include/linux/qed/nvmetcp_common.h |  18 +-
 include/linux/qed/qed_if.h         |   5 +-
 5 files changed, 386 insertions(+), 387 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/qed/common_hsi.h b/include/linux/qed/common_hsi.h
index 0a3807e927c5..3742d1f7d1f7 100644
--- a/include/linux/qed/common_hsi.h
+++ b/include/linux/qed/common_hsi.h
@@ -133,7 +133,7 @@
 #define NUM_OF_TCS		(NUM_OF_PHYS_TCS + 1)
 
 /* CIDs */
-#define NUM_OF_CONNECTION_TYPES_E4	(8)
+#define NUM_OF_CONNECTION_TYPES	(8)
 #define NUM_OF_LCIDS			(320)
 #define NUM_OF_LTIDS			(320)
 
@@ -379,7 +379,7 @@
 #define CAU_FSM_ETH_TX  1
 
 /* Number of Protocol Indices per Status Block */
-#define PIS_PER_SB_E4	12
+#define PIS_PER_SB	12
 #define MAX_PIS_PER_SB	PIS_PER_SB
 
 #define CAU_HC_STOPPED_STATE	3
@@ -1221,20 +1221,20 @@ struct rdif_task_context {
 };
 
 /* Status block structure */
-struct status_block_e4 {
-	__le16	pi_array[PIS_PER_SB_E4];
+struct status_block {
+	__le16	pi_array[PIS_PER_SB];
 	__le32	sb_num;
-#define STATUS_BLOCK_E4_SB_NUM_MASK	0x1FF
-#define STATUS_BLOCK_E4_SB_NUM_SHIFT	0
-#define STATUS_BLOCK_E4_ZERO_PAD_MASK	0x7F
-#define STATUS_BLOCK_E4_ZERO_PAD_SHIFT	9
-#define STATUS_BLOCK_E4_ZERO_PAD2_MASK	0xFFFF
-#define STATUS_BLOCK_E4_ZERO_PAD2_SHIFT	16
+#define STATUS_BLOCK_SB_NUM_MASK	0x1FF
+#define STATUS_BLOCK_SB_NUM_SHIFT	0
+#define STATUS_BLOCK_ZERO_PAD_MASK	0x7F
+#define STATUS_BLOCK_ZERO_PAD_SHIFT	9
+#define STATUS_BLOCK_ZERO_PAD2_MASK	0xFFFF
+#define STATUS_BLOCK_ZERO_PAD2_SHIFT	16
 	__le32 prod_index;
-#define STATUS_BLOCK_E4_PROD_INDEX_MASK		0xFFFFFF
-#define STATUS_BLOCK_E4_PROD_INDEX_SHIFT	0
-#define STATUS_BLOCK_E4_ZERO_PAD3_MASK		0xFF
-#define STATUS_BLOCK_E4_ZERO_PAD3_SHIFT		24
+#define STATUS_BLOCK_PROD_INDEX_MASK		0xFFFFFF
+#define STATUS_BLOCK_PROD_INDEX_SHIFT	0
+#define STATUS_BLOCK_ZERO_PAD3_MASK		0xFF
+#define STATUS_BLOCK_ZERO_PAD3_SHIFT		24
 };
 
 /* Tdif context */
diff --git a/include/linux/qed/fcoe_common.h b/include/linux/qed/fcoe_common.h
index 68eda1c21cde..7ba0abc867f1 100644
--- a/include/linux/qed/fcoe_common.h
+++ b/include/linux/qed/fcoe_common.h
@@ -150,49 +150,49 @@ struct ystorm_fcoe_task_st_ctx {
 	u8 reserved2[8];
 };
 
-struct e4_ystorm_fcoe_task_ag_ctx {
+struct ystorm_fcoe_task_ag_ctx {
 	u8 byte0;
 	u8 byte1;
 	__le16 word0;
 	u8 flags0;
-#define E4_YSTORM_FCOE_TASK_AG_CTX_NIBBLE0_MASK		0xF
-#define E4_YSTORM_FCOE_TASK_AG_CTX_NIBBLE0_SHIFT	0
-#define E4_YSTORM_FCOE_TASK_AG_CTX_BIT0_MASK		0x1
-#define E4_YSTORM_FCOE_TASK_AG_CTX_BIT0_SHIFT		4
-#define E4_YSTORM_FCOE_TASK_AG_CTX_BIT1_MASK		0x1
-#define E4_YSTORM_FCOE_TASK_AG_CTX_BIT1_SHIFT		5
-#define E4_YSTORM_FCOE_TASK_AG_CTX_BIT2_MASK		0x1
-#define E4_YSTORM_FCOE_TASK_AG_CTX_BIT2_SHIFT		6
-#define E4_YSTORM_FCOE_TASK_AG_CTX_BIT3_MASK		0x1
-#define E4_YSTORM_FCOE_TASK_AG_CTX_BIT3_SHIFT		7
+#define YSTORM_FCOE_TASK_AG_CTX_NIBBLE0_MASK		0xF
+#define YSTORM_FCOE_TASK_AG_CTX_NIBBLE0_SHIFT	0
+#define YSTORM_FCOE_TASK_AG_CTX_BIT0_MASK		0x1
+#define YSTORM_FCOE_TASK_AG_CTX_BIT0_SHIFT		4
+#define YSTORM_FCOE_TASK_AG_CTX_BIT1_MASK		0x1
+#define YSTORM_FCOE_TASK_AG_CTX_BIT1_SHIFT		5
+#define YSTORM_FCOE_TASK_AG_CTX_BIT2_MASK		0x1
+#define YSTORM_FCOE_TASK_AG_CTX_BIT2_SHIFT		6
+#define YSTORM_FCOE_TASK_AG_CTX_BIT3_MASK		0x1
+#define YSTORM_FCOE_TASK_AG_CTX_BIT3_SHIFT		7
 	u8 flags1;
-#define E4_YSTORM_FCOE_TASK_AG_CTX_CF0_MASK		0x3
-#define E4_YSTORM_FCOE_TASK_AG_CTX_CF0_SHIFT		0
-#define E4_YSTORM_FCOE_TASK_AG_CTX_CF1_MASK		0x3
-#define E4_YSTORM_FCOE_TASK_AG_CTX_CF1_SHIFT		2
-#define E4_YSTORM_FCOE_TASK_AG_CTX_CF2SPECIAL_MASK	0x3
-#define E4_YSTORM_FCOE_TASK_AG_CTX_CF2SPECIAL_SHIFT	4
-#define E4_YSTORM_FCOE_TASK_AG_CTX_CF0EN_MASK		0x1
-#define E4_YSTORM_FCOE_TASK_AG_CTX_CF0EN_SHIFT		6
-#define E4_YSTORM_FCOE_TASK_AG_CTX_CF1EN_MASK		0x1
-#define E4_YSTORM_FCOE_TASK_AG_CTX_CF1EN_SHIFT		7
+#define YSTORM_FCOE_TASK_AG_CTX_CF0_MASK		0x3
+#define YSTORM_FCOE_TASK_AG_CTX_CF0_SHIFT		0
+#define YSTORM_FCOE_TASK_AG_CTX_CF1_MASK		0x3
+#define YSTORM_FCOE_TASK_AG_CTX_CF1_SHIFT		2
+#define YSTORM_FCOE_TASK_AG_CTX_CF2SPECIAL_MASK	0x3
+#define YSTORM_FCOE_TASK_AG_CTX_CF2SPECIAL_SHIFT	4
+#define YSTORM_FCOE_TASK_AG_CTX_CF0EN_MASK		0x1
+#define YSTORM_FCOE_TASK_AG_CTX_CF0EN_SHIFT		6
+#define YSTORM_FCOE_TASK_AG_CTX_CF1EN_MASK		0x1
+#define YSTORM_FCOE_TASK_AG_CTX_CF1EN_SHIFT		7
 	u8 flags2;
-#define E4_YSTORM_FCOE_TASK_AG_CTX_BIT4_MASK		0x1
-#define E4_YSTORM_FCOE_TASK_AG_CTX_BIT4_SHIFT		0
-#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE0EN_MASK		0x1
-#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE0EN_SHIFT	1
-#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE1EN_MASK		0x1
-#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE1EN_SHIFT	2
-#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE2EN_MASK		0x1
-#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE2EN_SHIFT	3
-#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE3EN_MASK		0x1
-#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE3EN_SHIFT	4
-#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE4EN_MASK		0x1
-#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE4EN_SHIFT	5
-#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE5EN_MASK		0x1
-#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE5EN_SHIFT	6
-#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE6EN_MASK		0x1
-#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE6EN_SHIFT	7
+#define YSTORM_FCOE_TASK_AG_CTX_BIT4_MASK		0x1
+#define YSTORM_FCOE_TASK_AG_CTX_BIT4_SHIFT		0
+#define YSTORM_FCOE_TASK_AG_CTX_RULE0EN_MASK		0x1
+#define YSTORM_FCOE_TASK_AG_CTX_RULE0EN_SHIFT	1
+#define YSTORM_FCOE_TASK_AG_CTX_RULE1EN_MASK		0x1
+#define YSTORM_FCOE_TASK_AG_CTX_RULE1EN_SHIFT	2
+#define YSTORM_FCOE_TASK_AG_CTX_RULE2EN_MASK		0x1
+#define YSTORM_FCOE_TASK_AG_CTX_RULE2EN_SHIFT	3
+#define YSTORM_FCOE_TASK_AG_CTX_RULE3EN_MASK		0x1
+#define YSTORM_FCOE_TASK_AG_CTX_RULE3EN_SHIFT	4
+#define YSTORM_FCOE_TASK_AG_CTX_RULE4EN_MASK		0x1
+#define YSTORM_FCOE_TASK_AG_CTX_RULE4EN_SHIFT	5
+#define YSTORM_FCOE_TASK_AG_CTX_RULE5EN_MASK		0x1
+#define YSTORM_FCOE_TASK_AG_CTX_RULE5EN_SHIFT	6
+#define YSTORM_FCOE_TASK_AG_CTX_RULE6EN_MASK		0x1
+#define YSTORM_FCOE_TASK_AG_CTX_RULE6EN_SHIFT	7
 	u8 byte2;
 	__le32 reg0;
 	u8 byte3;
@@ -206,73 +206,73 @@ struct e4_ystorm_fcoe_task_ag_ctx {
 	__le32 reg2;
 };
 
-struct e4_tstorm_fcoe_task_ag_ctx {
+struct tstorm_fcoe_task_ag_ctx {
 	u8 reserved;
 	u8 byte1;
 	__le16 icid;
 	u8 flags0;
-#define E4_TSTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE_MASK		0xF
-#define E4_TSTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE_SHIFT	0
-#define E4_TSTORM_FCOE_TASK_AG_CTX_EXIST_IN_QM0_MASK		0x1
-#define E4_TSTORM_FCOE_TASK_AG_CTX_EXIST_IN_QM0_SHIFT		4
-#define E4_TSTORM_FCOE_TASK_AG_CTX_BIT1_MASK			0x1
-#define E4_TSTORM_FCOE_TASK_AG_CTX_BIT1_SHIFT			5
-#define E4_TSTORM_FCOE_TASK_AG_CTX_WAIT_ABTS_RSP_F_MASK		0x1
-#define E4_TSTORM_FCOE_TASK_AG_CTX_WAIT_ABTS_RSP_F_SHIFT	6
-#define E4_TSTORM_FCOE_TASK_AG_CTX_VALID_MASK			0x1
-#define E4_TSTORM_FCOE_TASK_AG_CTX_VALID_SHIFT			7
+#define TSTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE_MASK		0xF
+#define TSTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE_SHIFT	0
+#define TSTORM_FCOE_TASK_AG_CTX_EXIST_IN_QM0_MASK		0x1
+#define TSTORM_FCOE_TASK_AG_CTX_EXIST_IN_QM0_SHIFT		4
+#define TSTORM_FCOE_TASK_AG_CTX_BIT1_MASK			0x1
+#define TSTORM_FCOE_TASK_AG_CTX_BIT1_SHIFT			5
+#define TSTORM_FCOE_TASK_AG_CTX_WAIT_ABTS_RSP_F_MASK		0x1
+#define TSTORM_FCOE_TASK_AG_CTX_WAIT_ABTS_RSP_F_SHIFT	6
+#define TSTORM_FCOE_TASK_AG_CTX_VALID_MASK			0x1
+#define TSTORM_FCOE_TASK_AG_CTX_VALID_SHIFT			7
 	u8 flags1;
-#define E4_TSTORM_FCOE_TASK_AG_CTX_FALSE_RR_TOV_MASK	0x1
-#define E4_TSTORM_FCOE_TASK_AG_CTX_FALSE_RR_TOV_SHIFT	0
-#define E4_TSTORM_FCOE_TASK_AG_CTX_BIT5_MASK		0x1
-#define E4_TSTORM_FCOE_TASK_AG_CTX_BIT5_SHIFT		1
-#define E4_TSTORM_FCOE_TASK_AG_CTX_REC_RR_TOV_CF_MASK	0x3
-#define E4_TSTORM_FCOE_TASK_AG_CTX_REC_RR_TOV_CF_SHIFT	2
-#define E4_TSTORM_FCOE_TASK_AG_CTX_ED_TOV_CF_MASK	0x3
-#define E4_TSTORM_FCOE_TASK_AG_CTX_ED_TOV_CF_SHIFT	4
-#define E4_TSTORM_FCOE_TASK_AG_CTX_CF2_MASK		0x3
-#define E4_TSTORM_FCOE_TASK_AG_CTX_CF2_SHIFT		6
+#define TSTORM_FCOE_TASK_AG_CTX_FALSE_RR_TOV_MASK	0x1
+#define TSTORM_FCOE_TASK_AG_CTX_FALSE_RR_TOV_SHIFT	0
+#define TSTORM_FCOE_TASK_AG_CTX_BIT5_MASK		0x1
+#define TSTORM_FCOE_TASK_AG_CTX_BIT5_SHIFT		1
+#define TSTORM_FCOE_TASK_AG_CTX_REC_RR_TOV_CF_MASK	0x3
+#define TSTORM_FCOE_TASK_AG_CTX_REC_RR_TOV_CF_SHIFT	2
+#define TSTORM_FCOE_TASK_AG_CTX_ED_TOV_CF_MASK	0x3
+#define TSTORM_FCOE_TASK_AG_CTX_ED_TOV_CF_SHIFT	4
+#define TSTORM_FCOE_TASK_AG_CTX_CF2_MASK		0x3
+#define TSTORM_FCOE_TASK_AG_CTX_CF2_SHIFT		6
 	u8 flags2;
-#define E4_TSTORM_FCOE_TASK_AG_CTX_TIMER_STOP_ALL_MASK		0x3
-#define E4_TSTORM_FCOE_TASK_AG_CTX_TIMER_STOP_ALL_SHIFT		0
-#define E4_TSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_MASK		0x3
-#define E4_TSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_SHIFT		2
-#define E4_TSTORM_FCOE_TASK_AG_CTX_SEQ_INIT_CF_MASK		0x3
-#define E4_TSTORM_FCOE_TASK_AG_CTX_SEQ_INIT_CF_SHIFT		4
-#define E4_TSTORM_FCOE_TASK_AG_CTX_SEQ_RECOVERY_CF_MASK		0x3
-#define E4_TSTORM_FCOE_TASK_AG_CTX_SEQ_RECOVERY_CF_SHIFT	6
+#define TSTORM_FCOE_TASK_AG_CTX_TIMER_STOP_ALL_MASK		0x3
+#define TSTORM_FCOE_TASK_AG_CTX_TIMER_STOP_ALL_SHIFT		0
+#define TSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_MASK		0x3
+#define TSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_SHIFT		2
+#define TSTORM_FCOE_TASK_AG_CTX_SEQ_INIT_CF_MASK		0x3
+#define TSTORM_FCOE_TASK_AG_CTX_SEQ_INIT_CF_SHIFT		4
+#define TSTORM_FCOE_TASK_AG_CTX_SEQ_RECOVERY_CF_MASK		0x3
+#define TSTORM_FCOE_TASK_AG_CTX_SEQ_RECOVERY_CF_SHIFT	6
 	u8 flags3;
-#define E4_TSTORM_FCOE_TASK_AG_CTX_UNSOL_COMP_CF_MASK		0x3
-#define E4_TSTORM_FCOE_TASK_AG_CTX_UNSOL_COMP_CF_SHIFT		0
-#define E4_TSTORM_FCOE_TASK_AG_CTX_REC_RR_TOV_CF_EN_MASK	0x1
-#define E4_TSTORM_FCOE_TASK_AG_CTX_REC_RR_TOV_CF_EN_SHIFT	2
-#define E4_TSTORM_FCOE_TASK_AG_CTX_ED_TOV_CF_EN_MASK		0x1
-#define E4_TSTORM_FCOE_TASK_AG_CTX_ED_TOV_CF_EN_SHIFT		3
-#define E4_TSTORM_FCOE_TASK_AG_CTX_CF2EN_MASK			0x1
-#define E4_TSTORM_FCOE_TASK_AG_CTX_CF2EN_SHIFT			4
-#define E4_TSTORM_FCOE_TASK_AG_CTX_TIMER_STOP_ALL_EN_MASK	0x1
-#define E4_TSTORM_FCOE_TASK_AG_CTX_TIMER_STOP_ALL_EN_SHIFT	5
-#define E4_TSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_EN_MASK	0x1
-#define E4_TSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_EN_SHIFT	6
-#define E4_TSTORM_FCOE_TASK_AG_CTX_SEQ_INIT_CF_EN_MASK		0x1
-#define E4_TSTORM_FCOE_TASK_AG_CTX_SEQ_INIT_CF_EN_SHIFT		7
+#define TSTORM_FCOE_TASK_AG_CTX_UNSOL_COMP_CF_MASK		0x3
+#define TSTORM_FCOE_TASK_AG_CTX_UNSOL_COMP_CF_SHIFT		0
+#define TSTORM_FCOE_TASK_AG_CTX_REC_RR_TOV_CF_EN_MASK	0x1
+#define TSTORM_FCOE_TASK_AG_CTX_REC_RR_TOV_CF_EN_SHIFT	2
+#define TSTORM_FCOE_TASK_AG_CTX_ED_TOV_CF_EN_MASK		0x1
+#define TSTORM_FCOE_TASK_AG_CTX_ED_TOV_CF_EN_SHIFT		3
+#define TSTORM_FCOE_TASK_AG_CTX_CF2EN_MASK			0x1
+#define TSTORM_FCOE_TASK_AG_CTX_CF2EN_SHIFT			4
+#define TSTORM_FCOE_TASK_AG_CTX_TIMER_STOP_ALL_EN_MASK	0x1
+#define TSTORM_FCOE_TASK_AG_CTX_TIMER_STOP_ALL_EN_SHIFT	5
+#define TSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_EN_MASK	0x1
+#define TSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_EN_SHIFT	6
+#define TSTORM_FCOE_TASK_AG_CTX_SEQ_INIT_CF_EN_MASK		0x1
+#define TSTORM_FCOE_TASK_AG_CTX_SEQ_INIT_CF_EN_SHIFT		7
 	u8 flags4;
-#define E4_TSTORM_FCOE_TASK_AG_CTX_SEQ_RECOVERY_CF_EN_MASK	0x1
-#define E4_TSTORM_FCOE_TASK_AG_CTX_SEQ_RECOVERY_CF_EN_SHIFT	0
-#define E4_TSTORM_FCOE_TASK_AG_CTX_UNSOL_COMP_CF_EN_MASK	0x1
-#define E4_TSTORM_FCOE_TASK_AG_CTX_UNSOL_COMP_CF_EN_SHIFT	1
-#define E4_TSTORM_FCOE_TASK_AG_CTX_RULE0EN_MASK			0x1
-#define E4_TSTORM_FCOE_TASK_AG_CTX_RULE0EN_SHIFT		2
-#define E4_TSTORM_FCOE_TASK_AG_CTX_RULE1EN_MASK			0x1
-#define E4_TSTORM_FCOE_TASK_AG_CTX_RULE1EN_SHIFT		3
-#define E4_TSTORM_FCOE_TASK_AG_CTX_RULE2EN_MASK			0x1
-#define E4_TSTORM_FCOE_TASK_AG_CTX_RULE2EN_SHIFT		4
-#define E4_TSTORM_FCOE_TASK_AG_CTX_RULE3EN_MASK			0x1
-#define E4_TSTORM_FCOE_TASK_AG_CTX_RULE3EN_SHIFT		5
-#define E4_TSTORM_FCOE_TASK_AG_CTX_RULE4EN_MASK			0x1
-#define E4_TSTORM_FCOE_TASK_AG_CTX_RULE4EN_SHIFT		6
-#define E4_TSTORM_FCOE_TASK_AG_CTX_RULE5EN_MASK			0x1
-#define E4_TSTORM_FCOE_TASK_AG_CTX_RULE5EN_SHIFT		7
+#define TSTORM_FCOE_TASK_AG_CTX_SEQ_RECOVERY_CF_EN_MASK	0x1
+#define TSTORM_FCOE_TASK_AG_CTX_SEQ_RECOVERY_CF_EN_SHIFT	0
+#define TSTORM_FCOE_TASK_AG_CTX_UNSOL_COMP_CF_EN_MASK	0x1
+#define TSTORM_FCOE_TASK_AG_CTX_UNSOL_COMP_CF_EN_SHIFT	1
+#define TSTORM_FCOE_TASK_AG_CTX_RULE0EN_MASK			0x1
+#define TSTORM_FCOE_TASK_AG_CTX_RULE0EN_SHIFT		2
+#define TSTORM_FCOE_TASK_AG_CTX_RULE1EN_MASK			0x1
+#define TSTORM_FCOE_TASK_AG_CTX_RULE1EN_SHIFT		3
+#define TSTORM_FCOE_TASK_AG_CTX_RULE2EN_MASK			0x1
+#define TSTORM_FCOE_TASK_AG_CTX_RULE2EN_SHIFT		4
+#define TSTORM_FCOE_TASK_AG_CTX_RULE3EN_MASK			0x1
+#define TSTORM_FCOE_TASK_AG_CTX_RULE3EN_SHIFT		5
+#define TSTORM_FCOE_TASK_AG_CTX_RULE4EN_MASK			0x1
+#define TSTORM_FCOE_TASK_AG_CTX_RULE4EN_SHIFT		6
+#define TSTORM_FCOE_TASK_AG_CTX_RULE5EN_MASK			0x1
+#define TSTORM_FCOE_TASK_AG_CTX_RULE5EN_SHIFT		7
 	u8 cleanup_state;
 	__le16 last_sent_tid;
 	__le32 rec_rr_tov_exp_timeout;
@@ -352,49 +352,49 @@ struct tstorm_fcoe_task_st_ctx {
 	struct fcoe_tstorm_fcoe_task_st_ctx_read_only read_only;
 };
 
-struct e4_mstorm_fcoe_task_ag_ctx {
+struct mstorm_fcoe_task_ag_ctx {
 	u8 byte0;
 	u8 byte1;
 	__le16 icid;
 	u8 flags0;
-#define E4_MSTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE_MASK		0xF
-#define E4_MSTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE_SHIFT	0
-#define E4_MSTORM_FCOE_TASK_AG_CTX_EXIST_IN_QM0_MASK		0x1
-#define E4_MSTORM_FCOE_TASK_AG_CTX_EXIST_IN_QM0_SHIFT		4
-#define E4_MSTORM_FCOE_TASK_AG_CTX_CQE_PLACED_MASK		0x1
-#define E4_MSTORM_FCOE_TASK_AG_CTX_CQE_PLACED_SHIFT		5
-#define E4_MSTORM_FCOE_TASK_AG_CTX_BIT2_MASK			0x1
-#define E4_MSTORM_FCOE_TASK_AG_CTX_BIT2_SHIFT			6
-#define E4_MSTORM_FCOE_TASK_AG_CTX_BIT3_MASK			0x1
-#define E4_MSTORM_FCOE_TASK_AG_CTX_BIT3_SHIFT			7
+#define MSTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE_MASK		0xF
+#define MSTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE_SHIFT	0
+#define MSTORM_FCOE_TASK_AG_CTX_EXIST_IN_QM0_MASK		0x1
+#define MSTORM_FCOE_TASK_AG_CTX_EXIST_IN_QM0_SHIFT		4
+#define MSTORM_FCOE_TASK_AG_CTX_CQE_PLACED_MASK		0x1
+#define MSTORM_FCOE_TASK_AG_CTX_CQE_PLACED_SHIFT		5
+#define MSTORM_FCOE_TASK_AG_CTX_BIT2_MASK			0x1
+#define MSTORM_FCOE_TASK_AG_CTX_BIT2_SHIFT			6
+#define MSTORM_FCOE_TASK_AG_CTX_BIT3_MASK			0x1
+#define MSTORM_FCOE_TASK_AG_CTX_BIT3_SHIFT			7
 	u8 flags1;
-#define E4_MSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_MASK		0x3
-#define E4_MSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_SHIFT		0
-#define E4_MSTORM_FCOE_TASK_AG_CTX_CF1_MASK			0x3
-#define E4_MSTORM_FCOE_TASK_AG_CTX_CF1_SHIFT			2
-#define E4_MSTORM_FCOE_TASK_AG_CTX_CF2_MASK			0x3
-#define E4_MSTORM_FCOE_TASK_AG_CTX_CF2_SHIFT			4
-#define E4_MSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_EN_MASK	0x1
-#define E4_MSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_EN_SHIFT	6
-#define E4_MSTORM_FCOE_TASK_AG_CTX_CF1EN_MASK			0x1
-#define E4_MSTORM_FCOE_TASK_AG_CTX_CF1EN_SHIFT			7
+#define MSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_MASK		0x3
+#define MSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_SHIFT		0
+#define MSTORM_FCOE_TASK_AG_CTX_CF1_MASK			0x3
+#define MSTORM_FCOE_TASK_AG_CTX_CF1_SHIFT			2
+#define MSTORM_FCOE_TASK_AG_CTX_CF2_MASK			0x3
+#define MSTORM_FCOE_TASK_AG_CTX_CF2_SHIFT			4
+#define MSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_EN_MASK	0x1
+#define MSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_EN_SHIFT	6
+#define MSTORM_FCOE_TASK_AG_CTX_CF1EN_MASK			0x1
+#define MSTORM_FCOE_TASK_AG_CTX_CF1EN_SHIFT			7
 	u8 flags2;
-#define E4_MSTORM_FCOE_TASK_AG_CTX_CF2EN_MASK			0x1
-#define E4_MSTORM_FCOE_TASK_AG_CTX_CF2EN_SHIFT			0
-#define E4_MSTORM_FCOE_TASK_AG_CTX_RULE0EN_MASK			0x1
-#define E4_MSTORM_FCOE_TASK_AG_CTX_RULE0EN_SHIFT		1
-#define E4_MSTORM_FCOE_TASK_AG_CTX_RULE1EN_MASK			0x1
-#define E4_MSTORM_FCOE_TASK_AG_CTX_RULE1EN_SHIFT		2
-#define E4_MSTORM_FCOE_TASK_AG_CTX_RULE2EN_MASK			0x1
-#define E4_MSTORM_FCOE_TASK_AG_CTX_RULE2EN_SHIFT		3
-#define E4_MSTORM_FCOE_TASK_AG_CTX_RULE3EN_MASK			0x1
-#define E4_MSTORM_FCOE_TASK_AG_CTX_RULE3EN_SHIFT		4
-#define E4_MSTORM_FCOE_TASK_AG_CTX_RULE4EN_MASK			0x1
-#define E4_MSTORM_FCOE_TASK_AG_CTX_RULE4EN_SHIFT		5
-#define E4_MSTORM_FCOE_TASK_AG_CTX_XFER_PLACEMENT_EN_MASK	0x1
-#define E4_MSTORM_FCOE_TASK_AG_CTX_XFER_PLACEMENT_EN_SHIFT	6
-#define E4_MSTORM_FCOE_TASK_AG_CTX_RULE6EN_MASK			0x1
-#define E4_MSTORM_FCOE_TASK_AG_CTX_RULE6EN_SHIFT		7
+#define MSTORM_FCOE_TASK_AG_CTX_CF2EN_MASK			0x1
+#define MSTORM_FCOE_TASK_AG_CTX_CF2EN_SHIFT			0
+#define MSTORM_FCOE_TASK_AG_CTX_RULE0EN_MASK			0x1
+#define MSTORM_FCOE_TASK_AG_CTX_RULE0EN_SHIFT		1
+#define MSTORM_FCOE_TASK_AG_CTX_RULE1EN_MASK			0x1
+#define MSTORM_FCOE_TASK_AG_CTX_RULE1EN_SHIFT		2
+#define MSTORM_FCOE_TASK_AG_CTX_RULE2EN_MASK			0x1
+#define MSTORM_FCOE_TASK_AG_CTX_RULE2EN_SHIFT		3
+#define MSTORM_FCOE_TASK_AG_CTX_RULE3EN_MASK			0x1
+#define MSTORM_FCOE_TASK_AG_CTX_RULE3EN_SHIFT		4
+#define MSTORM_FCOE_TASK_AG_CTX_RULE4EN_MASK			0x1
+#define MSTORM_FCOE_TASK_AG_CTX_RULE4EN_SHIFT		5
+#define MSTORM_FCOE_TASK_AG_CTX_XFER_PLACEMENT_EN_MASK	0x1
+#define MSTORM_FCOE_TASK_AG_CTX_XFER_PLACEMENT_EN_SHIFT	6
+#define MSTORM_FCOE_TASK_AG_CTX_RULE6EN_MASK			0x1
+#define MSTORM_FCOE_TASK_AG_CTX_RULE6EN_SHIFT		7
 	u8 cleanup_state;
 	__le32 received_bytes;
 	u8 byte3;
@@ -440,56 +440,56 @@ struct mstorm_fcoe_task_st_ctx {
 	struct scsi_cached_sges data_desc;
 };
 
-struct e4_ustorm_fcoe_task_ag_ctx {
+struct ustorm_fcoe_task_ag_ctx {
 	u8 reserved;
 	u8 byte1;
 	__le16 icid;
 	u8 flags0;
-#define E4_USTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE_MASK		0xF
-#define E4_USTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE_SHIFT	0
-#define E4_USTORM_FCOE_TASK_AG_CTX_EXIST_IN_QM0_MASK		0x1
-#define E4_USTORM_FCOE_TASK_AG_CTX_EXIST_IN_QM0_SHIFT		4
-#define E4_USTORM_FCOE_TASK_AG_CTX_BIT1_MASK			0x1
-#define E4_USTORM_FCOE_TASK_AG_CTX_BIT1_SHIFT			5
-#define E4_USTORM_FCOE_TASK_AG_CTX_CF0_MASK			0x3
-#define E4_USTORM_FCOE_TASK_AG_CTX_CF0_SHIFT			6
+#define USTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE_MASK		0xF
+#define USTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE_SHIFT	0
+#define USTORM_FCOE_TASK_AG_CTX_EXIST_IN_QM0_MASK		0x1
+#define USTORM_FCOE_TASK_AG_CTX_EXIST_IN_QM0_SHIFT		4
+#define USTORM_FCOE_TASK_AG_CTX_BIT1_MASK			0x1
+#define USTORM_FCOE_TASK_AG_CTX_BIT1_SHIFT			5
+#define USTORM_FCOE_TASK_AG_CTX_CF0_MASK			0x3
+#define USTORM_FCOE_TASK_AG_CTX_CF0_SHIFT			6
 	u8 flags1;
-#define E4_USTORM_FCOE_TASK_AG_CTX_CF1_MASK		0x3
-#define E4_USTORM_FCOE_TASK_AG_CTX_CF1_SHIFT		0
-#define E4_USTORM_FCOE_TASK_AG_CTX_CF2_MASK		0x3
-#define E4_USTORM_FCOE_TASK_AG_CTX_CF2_SHIFT		2
-#define E4_USTORM_FCOE_TASK_AG_CTX_CF3_MASK		0x3
-#define E4_USTORM_FCOE_TASK_AG_CTX_CF3_SHIFT		4
-#define E4_USTORM_FCOE_TASK_AG_CTX_DIF_ERROR_CF_MASK	0x3
-#define E4_USTORM_FCOE_TASK_AG_CTX_DIF_ERROR_CF_SHIFT	6
+#define USTORM_FCOE_TASK_AG_CTX_CF1_MASK		0x3
+#define USTORM_FCOE_TASK_AG_CTX_CF1_SHIFT		0
+#define USTORM_FCOE_TASK_AG_CTX_CF2_MASK		0x3
+#define USTORM_FCOE_TASK_AG_CTX_CF2_SHIFT		2
+#define USTORM_FCOE_TASK_AG_CTX_CF3_MASK		0x3
+#define USTORM_FCOE_TASK_AG_CTX_CF3_SHIFT		4
+#define USTORM_FCOE_TASK_AG_CTX_DIF_ERROR_CF_MASK	0x3
+#define USTORM_FCOE_TASK_AG_CTX_DIF_ERROR_CF_SHIFT	6
 	u8 flags2;
-#define E4_USTORM_FCOE_TASK_AG_CTX_CF0EN_MASK			0x1
-#define E4_USTORM_FCOE_TASK_AG_CTX_CF0EN_SHIFT			0
-#define E4_USTORM_FCOE_TASK_AG_CTX_CF1EN_MASK			0x1
-#define E4_USTORM_FCOE_TASK_AG_CTX_CF1EN_SHIFT			1
-#define E4_USTORM_FCOE_TASK_AG_CTX_CF2EN_MASK			0x1
-#define E4_USTORM_FCOE_TASK_AG_CTX_CF2EN_SHIFT			2
-#define E4_USTORM_FCOE_TASK_AG_CTX_CF3EN_MASK			0x1
-#define E4_USTORM_FCOE_TASK_AG_CTX_CF3EN_SHIFT			3
-#define E4_USTORM_FCOE_TASK_AG_CTX_DIF_ERROR_CF_EN_MASK		0x1
-#define E4_USTORM_FCOE_TASK_AG_CTX_DIF_ERROR_CF_EN_SHIFT	4
-#define E4_USTORM_FCOE_TASK_AG_CTX_RULE0EN_MASK			0x1
-#define E4_USTORM_FCOE_TASK_AG_CTX_RULE0EN_SHIFT		5
-#define E4_USTORM_FCOE_TASK_AG_CTX_RULE1EN_MASK			0x1
-#define E4_USTORM_FCOE_TASK_AG_CTX_RULE1EN_SHIFT		6
-#define E4_USTORM_FCOE_TASK_AG_CTX_RULE2EN_MASK			0x1
-#define E4_USTORM_FCOE_TASK_AG_CTX_RULE2EN_SHIFT		7
+#define USTORM_FCOE_TASK_AG_CTX_CF0EN_MASK			0x1
+#define USTORM_FCOE_TASK_AG_CTX_CF0EN_SHIFT			0
+#define USTORM_FCOE_TASK_AG_CTX_CF1EN_MASK			0x1
+#define USTORM_FCOE_TASK_AG_CTX_CF1EN_SHIFT			1
+#define USTORM_FCOE_TASK_AG_CTX_CF2EN_MASK			0x1
+#define USTORM_FCOE_TASK_AG_CTX_CF2EN_SHIFT			2
+#define USTORM_FCOE_TASK_AG_CTX_CF3EN_MASK			0x1
+#define USTORM_FCOE_TASK_AG_CTX_CF3EN_SHIFT			3
+#define USTORM_FCOE_TASK_AG_CTX_DIF_ERROR_CF_EN_MASK		0x1
+#define USTORM_FCOE_TASK_AG_CTX_DIF_ERROR_CF_EN_SHIFT	4
+#define USTORM_FCOE_TASK_AG_CTX_RULE0EN_MASK			0x1
+#define USTORM_FCOE_TASK_AG_CTX_RULE0EN_SHIFT		5
+#define USTORM_FCOE_TASK_AG_CTX_RULE1EN_MASK			0x1
+#define USTORM_FCOE_TASK_AG_CTX_RULE1EN_SHIFT		6
+#define USTORM_FCOE_TASK_AG_CTX_RULE2EN_MASK			0x1
+#define USTORM_FCOE_TASK_AG_CTX_RULE2EN_SHIFT		7
 	u8 flags3;
-#define E4_USTORM_FCOE_TASK_AG_CTX_RULE3EN_MASK		0x1
-#define E4_USTORM_FCOE_TASK_AG_CTX_RULE3EN_SHIFT	0
-#define E4_USTORM_FCOE_TASK_AG_CTX_RULE4EN_MASK		0x1
-#define E4_USTORM_FCOE_TASK_AG_CTX_RULE4EN_SHIFT	1
-#define E4_USTORM_FCOE_TASK_AG_CTX_RULE5EN_MASK		0x1
-#define E4_USTORM_FCOE_TASK_AG_CTX_RULE5EN_SHIFT	2
-#define E4_USTORM_FCOE_TASK_AG_CTX_RULE6EN_MASK		0x1
-#define E4_USTORM_FCOE_TASK_AG_CTX_RULE6EN_SHIFT	3
-#define E4_USTORM_FCOE_TASK_AG_CTX_DIF_ERROR_TYPE_MASK	0xF
-#define E4_USTORM_FCOE_TASK_AG_CTX_DIF_ERROR_TYPE_SHIFT	4
+#define USTORM_FCOE_TASK_AG_CTX_RULE3EN_MASK		0x1
+#define USTORM_FCOE_TASK_AG_CTX_RULE3EN_SHIFT	0
+#define USTORM_FCOE_TASK_AG_CTX_RULE4EN_MASK		0x1
+#define USTORM_FCOE_TASK_AG_CTX_RULE4EN_SHIFT	1
+#define USTORM_FCOE_TASK_AG_CTX_RULE5EN_MASK		0x1
+#define USTORM_FCOE_TASK_AG_CTX_RULE5EN_SHIFT	2
+#define USTORM_FCOE_TASK_AG_CTX_RULE6EN_MASK		0x1
+#define USTORM_FCOE_TASK_AG_CTX_RULE6EN_SHIFT	3
+#define USTORM_FCOE_TASK_AG_CTX_DIF_ERROR_TYPE_MASK	0xF
+#define USTORM_FCOE_TASK_AG_CTX_DIF_ERROR_TYPE_SHIFT	4
 	__le32 dif_err_intervals;
 	__le32 dif_error_1st_interval;
 	__le32 global_cq_num;
@@ -499,18 +499,18 @@ struct e4_ustorm_fcoe_task_ag_ctx {
 };
 
 /* FCoE task context */
-struct e4_fcoe_task_context {
+struct fcoe_task_context {
 	struct ystorm_fcoe_task_st_ctx ystorm_st_context;
 	struct regpair ystorm_st_padding[2];
 	struct tdif_task_context tdif_context;
-	struct e4_ystorm_fcoe_task_ag_ctx ystorm_ag_context;
-	struct e4_tstorm_fcoe_task_ag_ctx tstorm_ag_context;
+	struct ystorm_fcoe_task_ag_ctx ystorm_ag_context;
+	struct tstorm_fcoe_task_ag_ctx tstorm_ag_context;
 	struct timers_context timer_context;
 	struct tstorm_fcoe_task_st_ctx tstorm_st_context;
 	struct regpair tstorm_st_padding[2];
-	struct e4_mstorm_fcoe_task_ag_ctx mstorm_ag_context;
+	struct mstorm_fcoe_task_ag_ctx mstorm_ag_context;
 	struct mstorm_fcoe_task_st_ctx mstorm_st_context;
-	struct e4_ustorm_fcoe_task_ag_ctx ustorm_ag_context;
+	struct ustorm_fcoe_task_ag_ctx ustorm_ag_context;
 	struct rdif_task_context rdif_context;
 };
 
diff --git a/include/linux/qed/iscsi_common.h b/include/linux/qed/iscsi_common.h
index 157019f716f1..1a60285a01e3 100644
--- a/include/linux/qed/iscsi_common.h
+++ b/include/linux/qed/iscsi_common.h
@@ -714,49 +714,49 @@ struct ystorm_iscsi_task_st_ctx {
 	union iscsi_task_hdr pdu_hdr;
 };
 
-struct e4_ystorm_iscsi_task_ag_ctx {
+struct ystorm_iscsi_task_ag_ctx {
 	u8 reserved;
 	u8 byte1;
 	__le16 word0;
 	u8 flags0;
-#define E4_YSTORM_ISCSI_TASK_AG_CTX_NIBBLE0_MASK	0xF
-#define E4_YSTORM_ISCSI_TASK_AG_CTX_NIBBLE0_SHIFT	0
-#define E4_YSTORM_ISCSI_TASK_AG_CTX_BIT0_MASK		0x1
-#define E4_YSTORM_ISCSI_TASK_AG_CTX_BIT0_SHIFT		4
-#define E4_YSTORM_ISCSI_TASK_AG_CTX_BIT1_MASK		0x1
-#define E4_YSTORM_ISCSI_TASK_AG_CTX_BIT1_SHIFT		5
-#define E4_YSTORM_ISCSI_TASK_AG_CTX_VALID_MASK		0x1
-#define E4_YSTORM_ISCSI_TASK_AG_CTX_VALID_SHIFT		6
-#define E4_YSTORM_ISCSI_TASK_AG_CTX_TTT_VALID_MASK   0x1	/* bit3 */
-#define E4_YSTORM_ISCSI_TASK_AG_CTX_TTT_VALID_SHIFT  7
+#define YSTORM_ISCSI_TASK_AG_CTX_NIBBLE0_MASK	0xF
+#define YSTORM_ISCSI_TASK_AG_CTX_NIBBLE0_SHIFT	0
+#define YSTORM_ISCSI_TASK_AG_CTX_BIT0_MASK		0x1
+#define YSTORM_ISCSI_TASK_AG_CTX_BIT0_SHIFT		4
+#define YSTORM_ISCSI_TASK_AG_CTX_BIT1_MASK		0x1
+#define YSTORM_ISCSI_TASK_AG_CTX_BIT1_SHIFT		5
+#define YSTORM_ISCSI_TASK_AG_CTX_VALID_MASK		0x1
+#define YSTORM_ISCSI_TASK_AG_CTX_VALID_SHIFT		6
+#define YSTORM_ISCSI_TASK_AG_CTX_TTT_VALID_MASK   0x1	/* bit3 */
+#define YSTORM_ISCSI_TASK_AG_CTX_TTT_VALID_SHIFT  7
 	u8 flags1;
-#define E4_YSTORM_ISCSI_TASK_AG_CTX_CF0_MASK		0x3
-#define E4_YSTORM_ISCSI_TASK_AG_CTX_CF0_SHIFT		0
-#define E4_YSTORM_ISCSI_TASK_AG_CTX_CF1_MASK		0x3
-#define E4_YSTORM_ISCSI_TASK_AG_CTX_CF1_SHIFT		2
-#define E4_YSTORM_ISCSI_TASK_AG_CTX_CF2SPECIAL_MASK	0x3
-#define E4_YSTORM_ISCSI_TASK_AG_CTX_CF2SPECIAL_SHIFT	4
-#define E4_YSTORM_ISCSI_TASK_AG_CTX_CF0EN_MASK		0x1
-#define E4_YSTORM_ISCSI_TASK_AG_CTX_CF0EN_SHIFT		6
-#define E4_YSTORM_ISCSI_TASK_AG_CTX_CF1EN_MASK		0x1
-#define E4_YSTORM_ISCSI_TASK_AG_CTX_CF1EN_SHIFT		7
+#define YSTORM_ISCSI_TASK_AG_CTX_CF0_MASK		0x3
+#define YSTORM_ISCSI_TASK_AG_CTX_CF0_SHIFT		0
+#define YSTORM_ISCSI_TASK_AG_CTX_CF1_MASK		0x3
+#define YSTORM_ISCSI_TASK_AG_CTX_CF1_SHIFT		2
+#define YSTORM_ISCSI_TASK_AG_CTX_CF2SPECIAL_MASK	0x3
+#define YSTORM_ISCSI_TASK_AG_CTX_CF2SPECIAL_SHIFT	4
+#define YSTORM_ISCSI_TASK_AG_CTX_CF0EN_MASK		0x1
+#define YSTORM_ISCSI_TASK_AG_CTX_CF0EN_SHIFT		6
+#define YSTORM_ISCSI_TASK_AG_CTX_CF1EN_MASK		0x1
+#define YSTORM_ISCSI_TASK_AG_CTX_CF1EN_SHIFT		7
 	u8 flags2;
-#define E4_YSTORM_ISCSI_TASK_AG_CTX_BIT4_MASK		0x1
-#define E4_YSTORM_ISCSI_TASK_AG_CTX_BIT4_SHIFT		0
-#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE0EN_MASK	0x1
-#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE0EN_SHIFT	1
-#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE1EN_MASK	0x1
-#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE1EN_SHIFT	2
-#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE2EN_MASK	0x1
-#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE2EN_SHIFT	3
-#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE3EN_MASK	0x1
-#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE3EN_SHIFT	4
-#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE4EN_MASK	0x1
-#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE4EN_SHIFT	5
-#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE5EN_MASK	0x1
-#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE5EN_SHIFT	6
-#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE6EN_MASK	0x1
-#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE6EN_SHIFT	7
+#define YSTORM_ISCSI_TASK_AG_CTX_BIT4_MASK		0x1
+#define YSTORM_ISCSI_TASK_AG_CTX_BIT4_SHIFT		0
+#define YSTORM_ISCSI_TASK_AG_CTX_RULE0EN_MASK	0x1
+#define YSTORM_ISCSI_TASK_AG_CTX_RULE0EN_SHIFT	1
+#define YSTORM_ISCSI_TASK_AG_CTX_RULE1EN_MASK	0x1
+#define YSTORM_ISCSI_TASK_AG_CTX_RULE1EN_SHIFT	2
+#define YSTORM_ISCSI_TASK_AG_CTX_RULE2EN_MASK	0x1
+#define YSTORM_ISCSI_TASK_AG_CTX_RULE2EN_SHIFT	3
+#define YSTORM_ISCSI_TASK_AG_CTX_RULE3EN_MASK	0x1
+#define YSTORM_ISCSI_TASK_AG_CTX_RULE3EN_SHIFT	4
+#define YSTORM_ISCSI_TASK_AG_CTX_RULE4EN_MASK	0x1
+#define YSTORM_ISCSI_TASK_AG_CTX_RULE4EN_SHIFT	5
+#define YSTORM_ISCSI_TASK_AG_CTX_RULE5EN_MASK	0x1
+#define YSTORM_ISCSI_TASK_AG_CTX_RULE5EN_SHIFT	6
+#define YSTORM_ISCSI_TASK_AG_CTX_RULE6EN_MASK	0x1
+#define YSTORM_ISCSI_TASK_AG_CTX_RULE6EN_SHIFT	7
 	u8 byte2;
 	__le32 TTT;
 	u8 byte3;
@@ -764,49 +764,49 @@ struct e4_ystorm_iscsi_task_ag_ctx {
 	__le16 word1;
 };
 
-struct e4_mstorm_iscsi_task_ag_ctx {
+struct mstorm_iscsi_task_ag_ctx {
 	u8 cdu_validation;
 	u8 byte1;
 	__le16 task_cid;
 	u8 flags0;
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_CONNECTION_TYPE_MASK	0xF
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_CONNECTION_TYPE_SHIFT	0
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_EXIST_IN_QM0_MASK		0x1
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_EXIST_IN_QM0_SHIFT		4
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_CONN_CLEAR_SQ_FLAG_MASK	0x1
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_CONN_CLEAR_SQ_FLAG_SHIFT	5
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_VALID_MASK			0x1
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_VALID_SHIFT			6
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_TASK_CLEANUP_FLAG_MASK	0x1
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_TASK_CLEANUP_FLAG_SHIFT	7
+#define MSTORM_ISCSI_TASK_AG_CTX_CONNECTION_TYPE_MASK	0xF
+#define MSTORM_ISCSI_TASK_AG_CTX_CONNECTION_TYPE_SHIFT	0
+#define MSTORM_ISCSI_TASK_AG_CTX_EXIST_IN_QM0_MASK		0x1
+#define MSTORM_ISCSI_TASK_AG_CTX_EXIST_IN_QM0_SHIFT		4
+#define MSTORM_ISCSI_TASK_AG_CTX_CONN_CLEAR_SQ_FLAG_MASK	0x1
+#define MSTORM_ISCSI_TASK_AG_CTX_CONN_CLEAR_SQ_FLAG_SHIFT	5
+#define MSTORM_ISCSI_TASK_AG_CTX_VALID_MASK			0x1
+#define MSTORM_ISCSI_TASK_AG_CTX_VALID_SHIFT			6
+#define MSTORM_ISCSI_TASK_AG_CTX_TASK_CLEANUP_FLAG_MASK	0x1
+#define MSTORM_ISCSI_TASK_AG_CTX_TASK_CLEANUP_FLAG_SHIFT	7
 	u8 flags1;
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_TASK_CLEANUP_CF_MASK	0x3
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_TASK_CLEANUP_CF_SHIFT	0
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_CF1_MASK			0x3
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_CF1_SHIFT			2
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_CF2_MASK			0x3
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_CF2_SHIFT			4
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_TASK_CLEANUP_CF_EN_MASK	0x1
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_TASK_CLEANUP_CF_EN_SHIFT	6
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_CF1EN_MASK			0x1
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_CF1EN_SHIFT			7
+#define MSTORM_ISCSI_TASK_AG_CTX_TASK_CLEANUP_CF_MASK	0x3
+#define MSTORM_ISCSI_TASK_AG_CTX_TASK_CLEANUP_CF_SHIFT	0
+#define MSTORM_ISCSI_TASK_AG_CTX_CF1_MASK			0x3
+#define MSTORM_ISCSI_TASK_AG_CTX_CF1_SHIFT			2
+#define MSTORM_ISCSI_TASK_AG_CTX_CF2_MASK			0x3
+#define MSTORM_ISCSI_TASK_AG_CTX_CF2_SHIFT			4
+#define MSTORM_ISCSI_TASK_AG_CTX_TASK_CLEANUP_CF_EN_MASK	0x1
+#define MSTORM_ISCSI_TASK_AG_CTX_TASK_CLEANUP_CF_EN_SHIFT	6
+#define MSTORM_ISCSI_TASK_AG_CTX_CF1EN_MASK			0x1
+#define MSTORM_ISCSI_TASK_AG_CTX_CF1EN_SHIFT			7
 	u8 flags2;
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_CF2EN_MASK		0x1
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_CF2EN_SHIFT		0
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE0EN_MASK	0x1
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE0EN_SHIFT	1
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE1EN_MASK	0x1
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE1EN_SHIFT	2
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE2EN_MASK	0x1
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE2EN_SHIFT	3
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE3EN_MASK	0x1
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE3EN_SHIFT	4
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE4EN_MASK	0x1
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE4EN_SHIFT	5
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE5EN_MASK	0x1
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE5EN_SHIFT	6
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE6EN_MASK	0x1
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE6EN_SHIFT	7
+#define MSTORM_ISCSI_TASK_AG_CTX_CF2EN_MASK		0x1
+#define MSTORM_ISCSI_TASK_AG_CTX_CF2EN_SHIFT		0
+#define MSTORM_ISCSI_TASK_AG_CTX_RULE0EN_MASK	0x1
+#define MSTORM_ISCSI_TASK_AG_CTX_RULE0EN_SHIFT	1
+#define MSTORM_ISCSI_TASK_AG_CTX_RULE1EN_MASK	0x1
+#define MSTORM_ISCSI_TASK_AG_CTX_RULE1EN_SHIFT	2
+#define MSTORM_ISCSI_TASK_AG_CTX_RULE2EN_MASK	0x1
+#define MSTORM_ISCSI_TASK_AG_CTX_RULE2EN_SHIFT	3
+#define MSTORM_ISCSI_TASK_AG_CTX_RULE3EN_MASK	0x1
+#define MSTORM_ISCSI_TASK_AG_CTX_RULE3EN_SHIFT	4
+#define MSTORM_ISCSI_TASK_AG_CTX_RULE4EN_MASK	0x1
+#define MSTORM_ISCSI_TASK_AG_CTX_RULE4EN_SHIFT	5
+#define MSTORM_ISCSI_TASK_AG_CTX_RULE5EN_MASK	0x1
+#define MSTORM_ISCSI_TASK_AG_CTX_RULE5EN_SHIFT	6
+#define MSTORM_ISCSI_TASK_AG_CTX_RULE6EN_MASK	0x1
+#define MSTORM_ISCSI_TASK_AG_CTX_RULE6EN_SHIFT	7
 	u8 byte2;
 	__le32 reg0;
 	u8 byte3;
@@ -814,56 +814,56 @@ struct e4_mstorm_iscsi_task_ag_ctx {
 	__le16 word1;
 };
 
-struct e4_ustorm_iscsi_task_ag_ctx {
+struct ustorm_iscsi_task_ag_ctx {
 	u8 reserved;
 	u8 state;
 	__le16 icid;
 	u8 flags0;
-#define E4_USTORM_ISCSI_TASK_AG_CTX_CONNECTION_TYPE_MASK	0xF
-#define E4_USTORM_ISCSI_TASK_AG_CTX_CONNECTION_TYPE_SHIFT	0
-#define E4_USTORM_ISCSI_TASK_AG_CTX_EXIST_IN_QM0_MASK		0x1
-#define E4_USTORM_ISCSI_TASK_AG_CTX_EXIST_IN_QM0_SHIFT		4
-#define E4_USTORM_ISCSI_TASK_AG_CTX_CONN_CLEAR_SQ_FLAG_MASK     0x1
-#define E4_USTORM_ISCSI_TASK_AG_CTX_CONN_CLEAR_SQ_FLAG_SHIFT    5
-#define E4_USTORM_ISCSI_TASK_AG_CTX_HQ_SCANNED_CF_MASK		0x3
-#define E4_USTORM_ISCSI_TASK_AG_CTX_HQ_SCANNED_CF_SHIFT		6
+#define USTORM_ISCSI_TASK_AG_CTX_CONNECTION_TYPE_MASK	0xF
+#define USTORM_ISCSI_TASK_AG_CTX_CONNECTION_TYPE_SHIFT	0
+#define USTORM_ISCSI_TASK_AG_CTX_EXIST_IN_QM0_MASK		0x1
+#define USTORM_ISCSI_TASK_AG_CTX_EXIST_IN_QM0_SHIFT		4
+#define USTORM_ISCSI_TASK_AG_CTX_CONN_CLEAR_SQ_FLAG_MASK     0x1
+#define USTORM_ISCSI_TASK_AG_CTX_CONN_CLEAR_SQ_FLAG_SHIFT    5
+#define USTORM_ISCSI_TASK_AG_CTX_HQ_SCANNED_CF_MASK		0x3
+#define USTORM_ISCSI_TASK_AG_CTX_HQ_SCANNED_CF_SHIFT		6
 	u8 flags1;
-#define E4_USTORM_ISCSI_TASK_AG_CTX_RESERVED1_MASK	0x3
-#define E4_USTORM_ISCSI_TASK_AG_CTX_RESERVED1_SHIFT	0
-#define E4_USTORM_ISCSI_TASK_AG_CTX_R2T2RECV_MASK	0x3
-#define E4_USTORM_ISCSI_TASK_AG_CTX_R2T2RECV_SHIFT	2
-#define E4_USTORM_ISCSI_TASK_AG_CTX_CF3_MASK		0x3
-#define E4_USTORM_ISCSI_TASK_AG_CTX_CF3_SHIFT		4
-#define E4_USTORM_ISCSI_TASK_AG_CTX_DIF_ERROR_CF_MASK	0x3
-#define E4_USTORM_ISCSI_TASK_AG_CTX_DIF_ERROR_CF_SHIFT	6
+#define USTORM_ISCSI_TASK_AG_CTX_RESERVED1_MASK	0x3
+#define USTORM_ISCSI_TASK_AG_CTX_RESERVED1_SHIFT	0
+#define USTORM_ISCSI_TASK_AG_CTX_R2T2RECV_MASK	0x3
+#define USTORM_ISCSI_TASK_AG_CTX_R2T2RECV_SHIFT	2
+#define USTORM_ISCSI_TASK_AG_CTX_CF3_MASK		0x3
+#define USTORM_ISCSI_TASK_AG_CTX_CF3_SHIFT		4
+#define USTORM_ISCSI_TASK_AG_CTX_DIF_ERROR_CF_MASK	0x3
+#define USTORM_ISCSI_TASK_AG_CTX_DIF_ERROR_CF_SHIFT	6
 	u8 flags2;
-#define E4_USTORM_ISCSI_TASK_AG_CTX_HQ_SCANNED_CF_EN_MASK	0x1
-#define E4_USTORM_ISCSI_TASK_AG_CTX_HQ_SCANNED_CF_EN_SHIFT	0
-#define E4_USTORM_ISCSI_TASK_AG_CTX_DISABLE_DATA_ACKED_MASK	0x1
-#define E4_USTORM_ISCSI_TASK_AG_CTX_DISABLE_DATA_ACKED_SHIFT	1
-#define E4_USTORM_ISCSI_TASK_AG_CTX_R2T2RECV_EN_MASK		0x1
-#define E4_USTORM_ISCSI_TASK_AG_CTX_R2T2RECV_EN_SHIFT		2
-#define E4_USTORM_ISCSI_TASK_AG_CTX_CF3EN_MASK			0x1
-#define E4_USTORM_ISCSI_TASK_AG_CTX_CF3EN_SHIFT			3
-#define E4_USTORM_ISCSI_TASK_AG_CTX_DIF_ERROR_CF_EN_MASK	0x1
-#define E4_USTORM_ISCSI_TASK_AG_CTX_DIF_ERROR_CF_EN_SHIFT	4
-#define E4_USTORM_ISCSI_TASK_AG_CTX_CMP_DATA_TOTAL_EXP_EN_MASK	0x1
-#define E4_USTORM_ISCSI_TASK_AG_CTX_CMP_DATA_TOTAL_EXP_EN_SHIFT	5
-#define E4_USTORM_ISCSI_TASK_AG_CTX_RULE1EN_MASK		0x1
-#define E4_USTORM_ISCSI_TASK_AG_CTX_RULE1EN_SHIFT		6
-#define E4_USTORM_ISCSI_TASK_AG_CTX_CMP_CONT_RCV_EXP_EN_MASK	0x1
-#define E4_USTORM_ISCSI_TASK_AG_CTX_CMP_CONT_RCV_EXP_EN_SHIFT	7
+#define USTORM_ISCSI_TASK_AG_CTX_HQ_SCANNED_CF_EN_MASK	0x1
+#define USTORM_ISCSI_TASK_AG_CTX_HQ_SCANNED_CF_EN_SHIFT	0
+#define USTORM_ISCSI_TASK_AG_CTX_DISABLE_DATA_ACKED_MASK	0x1
+#define USTORM_ISCSI_TASK_AG_CTX_DISABLE_DATA_ACKED_SHIFT	1
+#define USTORM_ISCSI_TASK_AG_CTX_R2T2RECV_EN_MASK		0x1
+#define USTORM_ISCSI_TASK_AG_CTX_R2T2RECV_EN_SHIFT		2
+#define USTORM_ISCSI_TASK_AG_CTX_CF3EN_MASK			0x1
+#define USTORM_ISCSI_TASK_AG_CTX_CF3EN_SHIFT			3
+#define USTORM_ISCSI_TASK_AG_CTX_DIF_ERROR_CF_EN_MASK	0x1
+#define USTORM_ISCSI_TASK_AG_CTX_DIF_ERROR_CF_EN_SHIFT	4
+#define USTORM_ISCSI_TASK_AG_CTX_CMP_DATA_TOTAL_EXP_EN_MASK	0x1
+#define USTORM_ISCSI_TASK_AG_CTX_CMP_DATA_TOTAL_EXP_EN_SHIFT	5
+#define USTORM_ISCSI_TASK_AG_CTX_RULE1EN_MASK		0x1
+#define USTORM_ISCSI_TASK_AG_CTX_RULE1EN_SHIFT		6
+#define USTORM_ISCSI_TASK_AG_CTX_CMP_CONT_RCV_EXP_EN_MASK	0x1
+#define USTORM_ISCSI_TASK_AG_CTX_CMP_CONT_RCV_EXP_EN_SHIFT	7
 	u8 flags3;
-#define E4_USTORM_ISCSI_TASK_AG_CTX_RULE3EN_MASK		0x1
-#define E4_USTORM_ISCSI_TASK_AG_CTX_RULE3EN_SHIFT		0
-#define E4_USTORM_ISCSI_TASK_AG_CTX_RULE4EN_MASK		0x1
-#define E4_USTORM_ISCSI_TASK_AG_CTX_RULE4EN_SHIFT		1
-#define E4_USTORM_ISCSI_TASK_AG_CTX_RULE5EN_MASK		0x1
-#define E4_USTORM_ISCSI_TASK_AG_CTX_RULE5EN_SHIFT		2
-#define E4_USTORM_ISCSI_TASK_AG_CTX_RULE6EN_MASK		0x1
-#define E4_USTORM_ISCSI_TASK_AG_CTX_RULE6EN_SHIFT		3
-#define E4_USTORM_ISCSI_TASK_AG_CTX_DIF_ERROR_TYPE_MASK		0xF
-#define E4_USTORM_ISCSI_TASK_AG_CTX_DIF_ERROR_TYPE_SHIFT	4
+#define USTORM_ISCSI_TASK_AG_CTX_RULE3EN_MASK		0x1
+#define USTORM_ISCSI_TASK_AG_CTX_RULE3EN_SHIFT		0
+#define USTORM_ISCSI_TASK_AG_CTX_RULE4EN_MASK		0x1
+#define USTORM_ISCSI_TASK_AG_CTX_RULE4EN_SHIFT		1
+#define USTORM_ISCSI_TASK_AG_CTX_RULE5EN_MASK		0x1
+#define USTORM_ISCSI_TASK_AG_CTX_RULE5EN_SHIFT		2
+#define USTORM_ISCSI_TASK_AG_CTX_RULE6EN_MASK		0x1
+#define USTORM_ISCSI_TASK_AG_CTX_RULE6EN_SHIFT		3
+#define USTORM_ISCSI_TASK_AG_CTX_DIF_ERROR_TYPE_MASK		0xF
+#define USTORM_ISCSI_TASK_AG_CTX_DIF_ERROR_TYPE_SHIFT	4
 	__le32 dif_err_intervals;
 	__le32 dif_error_1st_interval;
 	__le32 rcv_cont_len;
@@ -952,14 +952,14 @@ struct ustorm_iscsi_task_st_ctx {
 };
 
 /* iscsi task context */
-struct e4_iscsi_task_context {
+struct iscsi_task_context {
 	struct ystorm_iscsi_task_st_ctx ystorm_st_context;
-	struct e4_ystorm_iscsi_task_ag_ctx ystorm_ag_context;
+	struct ystorm_iscsi_task_ag_ctx ystorm_ag_context;
 	struct regpair ystorm_ag_padding[2];
 	struct tdif_task_context tdif_context;
-	struct e4_mstorm_iscsi_task_ag_ctx mstorm_ag_context;
+	struct mstorm_iscsi_task_ag_ctx mstorm_ag_context;
 	struct regpair mstorm_ag_padding[2];
-	struct e4_ustorm_iscsi_task_ag_ctx ustorm_ag_context;
+	struct ustorm_iscsi_task_ag_ctx ustorm_ag_context;
 	struct mstorm_iscsi_task_st_ctx mstorm_st_context;
 	struct ustorm_iscsi_task_st_ctx ustorm_st_context;
 	struct rdif_task_context rdif_context;
@@ -1431,73 +1431,73 @@ struct ystorm_iscsi_stats_drv {
 	struct regpair iscsi_tx_tcp_pkt_cnt;
 };
 
-struct e4_tstorm_iscsi_task_ag_ctx {
+struct tstorm_iscsi_task_ag_ctx {
 	u8 byte0;
 	u8 byte1;
 	__le16 word0;
 	u8 flags0;
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_NIBBLE0_MASK	0xF
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_NIBBLE0_SHIFT	0
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_BIT0_MASK		0x1
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_BIT0_SHIFT		4
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_BIT1_MASK		0x1
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_BIT1_SHIFT		5
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_BIT2_MASK		0x1
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_BIT2_SHIFT		6
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_BIT3_MASK		0x1
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_BIT3_SHIFT		7
+#define TSTORM_ISCSI_TASK_AG_CTX_NIBBLE0_MASK	0xF
+#define TSTORM_ISCSI_TASK_AG_CTX_NIBBLE0_SHIFT	0
+#define TSTORM_ISCSI_TASK_AG_CTX_BIT0_MASK		0x1
+#define TSTORM_ISCSI_TASK_AG_CTX_BIT0_SHIFT		4
+#define TSTORM_ISCSI_TASK_AG_CTX_BIT1_MASK		0x1
+#define TSTORM_ISCSI_TASK_AG_CTX_BIT1_SHIFT		5
+#define TSTORM_ISCSI_TASK_AG_CTX_BIT2_MASK		0x1
+#define TSTORM_ISCSI_TASK_AG_CTX_BIT2_SHIFT		6
+#define TSTORM_ISCSI_TASK_AG_CTX_BIT3_MASK		0x1
+#define TSTORM_ISCSI_TASK_AG_CTX_BIT3_SHIFT		7
 	u8 flags1;
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_BIT4_MASK	0x1
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_BIT4_SHIFT	0
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_BIT5_MASK	0x1
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_BIT5_SHIFT	1
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF0_MASK	0x3
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF0_SHIFT	2
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF1_MASK	0x3
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF1_SHIFT	4
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF2_MASK	0x3
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF2_SHIFT	6
+#define TSTORM_ISCSI_TASK_AG_CTX_BIT4_MASK	0x1
+#define TSTORM_ISCSI_TASK_AG_CTX_BIT4_SHIFT	0
+#define TSTORM_ISCSI_TASK_AG_CTX_BIT5_MASK	0x1
+#define TSTORM_ISCSI_TASK_AG_CTX_BIT5_SHIFT	1
+#define TSTORM_ISCSI_TASK_AG_CTX_CF0_MASK	0x3
+#define TSTORM_ISCSI_TASK_AG_CTX_CF0_SHIFT	2
+#define TSTORM_ISCSI_TASK_AG_CTX_CF1_MASK	0x3
+#define TSTORM_ISCSI_TASK_AG_CTX_CF1_SHIFT	4
+#define TSTORM_ISCSI_TASK_AG_CTX_CF2_MASK	0x3
+#define TSTORM_ISCSI_TASK_AG_CTX_CF2_SHIFT	6
 	u8 flags2;
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF3_MASK	0x3
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF3_SHIFT	0
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF4_MASK	0x3
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF4_SHIFT	2
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF5_MASK	0x3
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF5_SHIFT	4
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF6_MASK	0x3
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF6_SHIFT	6
+#define TSTORM_ISCSI_TASK_AG_CTX_CF3_MASK	0x3
+#define TSTORM_ISCSI_TASK_AG_CTX_CF3_SHIFT	0
+#define TSTORM_ISCSI_TASK_AG_CTX_CF4_MASK	0x3
+#define TSTORM_ISCSI_TASK_AG_CTX_CF4_SHIFT	2
+#define TSTORM_ISCSI_TASK_AG_CTX_CF5_MASK	0x3
+#define TSTORM_ISCSI_TASK_AG_CTX_CF5_SHIFT	4
+#define TSTORM_ISCSI_TASK_AG_CTX_CF6_MASK	0x3
+#define TSTORM_ISCSI_TASK_AG_CTX_CF6_SHIFT	6
 	u8 flags3;
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF7_MASK	0x3
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF7_SHIFT	0
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF0EN_MASK	0x1
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF0EN_SHIFT	2
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF1EN_MASK	0x1
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF1EN_SHIFT	3
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF2EN_MASK	0x1
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF2EN_SHIFT	4
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF3EN_MASK	0x1
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF3EN_SHIFT	5
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF4EN_MASK	0x1
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF4EN_SHIFT	6
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF5EN_MASK	0x1
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF5EN_SHIFT	7
+#define TSTORM_ISCSI_TASK_AG_CTX_CF7_MASK	0x3
+#define TSTORM_ISCSI_TASK_AG_CTX_CF7_SHIFT	0
+#define TSTORM_ISCSI_TASK_AG_CTX_CF0EN_MASK	0x1
+#define TSTORM_ISCSI_TASK_AG_CTX_CF0EN_SHIFT	2
+#define TSTORM_ISCSI_TASK_AG_CTX_CF1EN_MASK	0x1
+#define TSTORM_ISCSI_TASK_AG_CTX_CF1EN_SHIFT	3
+#define TSTORM_ISCSI_TASK_AG_CTX_CF2EN_MASK	0x1
+#define TSTORM_ISCSI_TASK_AG_CTX_CF2EN_SHIFT	4
+#define TSTORM_ISCSI_TASK_AG_CTX_CF3EN_MASK	0x1
+#define TSTORM_ISCSI_TASK_AG_CTX_CF3EN_SHIFT	5
+#define TSTORM_ISCSI_TASK_AG_CTX_CF4EN_MASK	0x1
+#define TSTORM_ISCSI_TASK_AG_CTX_CF4EN_SHIFT	6
+#define TSTORM_ISCSI_TASK_AG_CTX_CF5EN_MASK	0x1
+#define TSTORM_ISCSI_TASK_AG_CTX_CF5EN_SHIFT	7
 	u8 flags4;
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF6EN_MASK		0x1
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF6EN_SHIFT		0
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF7EN_MASK		0x1
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF7EN_SHIFT		1
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_RULE0EN_MASK	0x1
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_RULE0EN_SHIFT	2
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_RULE1EN_MASK	0x1
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_RULE1EN_SHIFT	3
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_RULE2EN_MASK	0x1
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_RULE2EN_SHIFT	4
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_RULE3EN_MASK	0x1
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_RULE3EN_SHIFT	5
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_RULE4EN_MASK	0x1
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_RULE4EN_SHIFT	6
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_RULE5EN_MASK	0x1
-#define E4_TSTORM_ISCSI_TASK_AG_CTX_RULE5EN_SHIFT	7
+#define TSTORM_ISCSI_TASK_AG_CTX_CF6EN_MASK		0x1
+#define TSTORM_ISCSI_TASK_AG_CTX_CF6EN_SHIFT		0
+#define TSTORM_ISCSI_TASK_AG_CTX_CF7EN_MASK		0x1
+#define TSTORM_ISCSI_TASK_AG_CTX_CF7EN_SHIFT		1
+#define TSTORM_ISCSI_TASK_AG_CTX_RULE0EN_MASK	0x1
+#define TSTORM_ISCSI_TASK_AG_CTX_RULE0EN_SHIFT	2
+#define TSTORM_ISCSI_TASK_AG_CTX_RULE1EN_MASK	0x1
+#define TSTORM_ISCSI_TASK_AG_CTX_RULE1EN_SHIFT	3
+#define TSTORM_ISCSI_TASK_AG_CTX_RULE2EN_MASK	0x1
+#define TSTORM_ISCSI_TASK_AG_CTX_RULE2EN_SHIFT	4
+#define TSTORM_ISCSI_TASK_AG_CTX_RULE3EN_MASK	0x1
+#define TSTORM_ISCSI_TASK_AG_CTX_RULE3EN_SHIFT	5
+#define TSTORM_ISCSI_TASK_AG_CTX_RULE4EN_MASK	0x1
+#define TSTORM_ISCSI_TASK_AG_CTX_RULE4EN_SHIFT	6
+#define TSTORM_ISCSI_TASK_AG_CTX_RULE5EN_MASK	0x1
+#define TSTORM_ISCSI_TASK_AG_CTX_RULE5EN_SHIFT	7
 	u8 byte2;
 	__le16 word1;
 	__le32 reg0;
diff --git a/include/linux/qed/nvmetcp_common.h b/include/linux/qed/nvmetcp_common.h
index 5a2ab0606308..cc7c7481a0e0 100644
--- a/include/linux/qed/nvmetcp_common.h
+++ b/include/linux/qed/nvmetcp_common.h
@@ -410,7 +410,7 @@ struct e5_ystorm_nvmetcp_task_ag_ctx {
 	u8 byte2;
 	u8 byte3;
 	u8 byte4;
-	u8 e4_reserved7;
+	u8 reserved7;
 };
 
 struct e5_mstorm_nvmetcp_task_ag_ctx {
@@ -445,7 +445,7 @@ struct e5_mstorm_nvmetcp_task_ag_ctx {
 	u8 byte2;
 	u8 byte3;
 	u8 byte4;
-	u8 e4_reserved7;
+	u8 reserved7;
 };
 
 struct e5_ustorm_nvmetcp_task_ag_ctx {
@@ -489,17 +489,17 @@ struct e5_ustorm_nvmetcp_task_ag_ctx {
 #define E5_USTORM_NVMETCP_TASK_AG_CTX_CMP_CONT_RCV_EXP_EN_SHIFT 7
 	u8 flags3;
 	u8 flags4;
-#define E5_USTORM_NVMETCP_TASK_AG_CTX_E4_RESERVED5_MASK 0x3
-#define E5_USTORM_NVMETCP_TASK_AG_CTX_E4_RESERVED5_SHIFT 0
-#define E5_USTORM_NVMETCP_TASK_AG_CTX_E4_RESERVED6_MASK 0x1
-#define E5_USTORM_NVMETCP_TASK_AG_CTX_E4_RESERVED6_SHIFT 2
-#define E5_USTORM_NVMETCP_TASK_AG_CTX_E4_RESERVED7_MASK 0x1
-#define E5_USTORM_NVMETCP_TASK_AG_CTX_E4_RESERVED7_SHIFT 3
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_RESERVED5_MASK 0x3
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_RESERVED5_SHIFT 0
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_RESERVED6_MASK 0x1
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_RESERVED6_SHIFT 2
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_RESERVED7_MASK 0x1
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_RESERVED7_SHIFT 3
 #define E5_USTORM_NVMETCP_TASK_AG_CTX_DIF_ERROR_TYPE_MASK 0xF
 #define E5_USTORM_NVMETCP_TASK_AG_CTX_DIF_ERROR_TYPE_SHIFT 4
 	u8 byte2;
 	u8 byte3;
-	u8 e4_reserved8;
+	u8 reserved8;
 	__le32 dif_err_intervals;
 	__le32 dif_error_1st_interval;
 	__le32 rcv_cont_len;
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index f39451aaaeec..4dcd0d37a521 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -588,7 +588,7 @@ enum qed_int_mode {
 };
 
 struct qed_sb_info {
-	struct status_block_e4 *sb_virt;
+	struct status_block *sb_virt;
 	dma_addr_t sb_phys;
 	u32 sb_ack; /* Last given ack */
 	u16 igu_sb_id;
@@ -613,7 +613,6 @@ enum qed_hw_err_type {
 enum qed_dev_type {
 	QED_DEV_TYPE_BB,
 	QED_DEV_TYPE_AH,
-	QED_DEV_TYPE_E5,
 };
 
 struct qed_dev_info {
@@ -1411,7 +1410,7 @@ static inline u16 qed_sb_update_sb_idx(struct qed_sb_info *sb_info)
 	u16 rc = 0;
 
 	prod = le32_to_cpu(sb_info->sb_virt->prod_index) &
-	       STATUS_BLOCK_E4_PROD_INDEX_MASK;
+	       STATUS_BLOCK_PROD_INDEX_MASK;
 	if (sb_info->sb_ack != prod) {
 		sb_info->sb_ack = prod;
 		rc |= QED_SB_IDX;
-- 
cgit v1.2.3


From 484563e230a8c68ab342080b41b5a5e2ce9621ef Mon Sep 17 00:00:00 2001
From: Prabhakar Kushwaha <pkushwaha@marvell.com>
Date: Mon, 4 Oct 2021 09:58:42 +0300
Subject: qed: Update common_hsi for FW ver 8.59.1.0

The common_hsi.h has been updated for FW version 8.59.1.0 with below
changes.
  - FW and Tools version.
  - New structures related to search table, packet duplication.
  - Structure for doorbell address for legacy mode without DEM.
  - Enhanced union rdma_eqe_data for RoCE Suspend Event Data.
  - New defines.

This patch also fixes the existing checkpatch warnings and few important
checks.

Signed-off-by: Ariel Elior <aelior@marvell.com>
Signed-off-by: Shai Malin <smalin@marvell.com>
Signed-off-by: Omkar Kulkarni <okulkarni@marvell.com>
Signed-off-by: Prabhakar Kushwaha <pkushwaha@marvell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/qed/common_hsi.h | 113 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 105 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/qed/common_hsi.h b/include/linux/qed/common_hsi.h
index 3742d1f7d1f7..827624840ee2 100644
--- a/include/linux/qed/common_hsi.h
+++ b/include/linux/qed/common_hsi.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause) */
 /* QLogic qed NIC Driver
  * Copyright (c) 2015-2016  QLogic Corporation
- * Copyright (c) 2019-2020 Marvell International Ltd.
+ * Copyright (c) 2019-2021 Marvell International Ltd.
  */
 
 #ifndef _COMMON_HSI_H
@@ -47,10 +47,10 @@
 #define ISCSI_CDU_TASK_SEG_TYPE			0
 #define FCOE_CDU_TASK_SEG_TYPE			0
 #define RDMA_CDU_TASK_SEG_TYPE			1
+#define ETH_CDU_TASK_SEG_TYPE			2
 
 #define FW_ASSERT_GENERAL_ATTN_IDX		32
 
-
 /* Queue Zone sizes in bytes */
 #define TSTORM_QZONE_SIZE	8
 #define MSTORM_QZONE_SIZE	16
@@ -60,9 +60,12 @@
 #define PSTORM_QZONE_SIZE	0
 
 #define MSTORM_VF_ZONE_DEFAULT_SIZE_LOG		7
-#define ETH_MAX_NUM_RX_QUEUES_PER_VF_DEFAULT	16
-#define ETH_MAX_NUM_RX_QUEUES_PER_VF_DOUBLE	48
-#define ETH_MAX_NUM_RX_QUEUES_PER_VF_QUAD	112
+#define ETH_MAX_RXQ_VF_DEFAULT 16
+#define ETH_MAX_RXQ_VF_DOUBLE 48
+#define ETH_MAX_RXQ_VF_QUAD 112
+
+#define ETH_RGSRC_CTX_SIZE			6
+#define ETH_TGSRC_CTX_SIZE			6
 
 /********************************/
 /* CORE (LIGHT L2) FW CONSTANTS */
@@ -89,8 +92,8 @@
 #define MAX_NUM_LL2_TX_STATS_COUNTERS  48
 
 #define FW_MAJOR_VERSION	8
-#define FW_MINOR_VERSION	42
-#define FW_REVISION_VERSION	2
+#define FW_MINOR_VERSION	59
+#define FW_REVISION_VERSION	1
 #define FW_ENGINEERING_VERSION	0
 
 /***********************/
@@ -112,6 +115,7 @@
 #define MAX_NUM_VFS	(MAX_NUM_VFS_K2)
 
 #define MAX_NUM_FUNCTIONS_BB	(MAX_NUM_PFS_BB + MAX_NUM_VFS_BB)
+#define MAX_NUM_FUNCTIONS_K2    (MAX_NUM_PFS_K2 + MAX_NUM_VFS_K2)
 
 #define MAX_FUNCTION_NUMBER_BB	(MAX_NUM_PFS + MAX_NUM_VFS_BB)
 #define MAX_FUNCTION_NUMBER_K2  (MAX_NUM_PFS + MAX_NUM_VFS_K2)
@@ -144,7 +148,7 @@
 #define GTT_DWORD_SIZE		BIT(GTT_DWORD_SIZE_BITS)
 
 /* Tools Version */
-#define TOOLS_VERSION	10
+#define TOOLS_VERSION 11
 
 /*****************/
 /* CDU CONSTANTS */
@@ -162,6 +166,7 @@
 #define CDU_CONTEXT_VALIDATION_CFG_USE_REGION			(3)
 #define CDU_CONTEXT_VALIDATION_CFG_USE_CID			(4)
 #define CDU_CONTEXT_VALIDATION_CFG_USE_ACTIVE			(5)
+#define CDU_CONTEXT_VALIDATION_DEFAULT_CFG			(0x3d)
 
 /*****************/
 /* DQ CONSTANTS  */
@@ -302,6 +307,9 @@
 /* PWM address mapping */
 #define DQ_PWM_OFFSET_DPM_BASE		0x0
 #define DQ_PWM_OFFSET_DPM_END		0x27
+#define DQ_PWM_OFFSET_XCM32_24ICID_BASE 0x28
+#define DQ_PWM_OFFSET_UCM32_24ICID_BASE 0x30
+#define DQ_PWM_OFFSET_TCM32_24ICID_BASE 0x38
 #define DQ_PWM_OFFSET_XCM16_BASE	0x40
 #define DQ_PWM_OFFSET_XCM32_BASE	0x44
 #define DQ_PWM_OFFSET_UCM16_BASE	0x48
@@ -325,6 +333,13 @@
 #define DQ_PWM_OFFSET_TCM_LL2_PROD_UPDATE \
 	(DQ_PWM_OFFSET_TCM32_BASE + DQ_TCM_AGG_VAL_SEL_REG9 - 4)
 
+#define DQ_PWM_OFFSET_XCM_RDMA_24B_ICID_SQ_PROD \
+	(DQ_PWM_OFFSET_XCM32_24ICID_BASE + 2)
+#define DQ_PWM_OFFSET_UCM_RDMA_24B_ICID_CQ_CONS_32BIT \
+	(DQ_PWM_OFFSET_UCM32_24ICID_BASE + 4)
+#define DQ_PWM_OFFSET_TCM_ROCE_24B_ICID_RQ_PROD \
+	(DQ_PWM_OFFSET_TCM32_24ICID_BASE + 1)
+
 #define	DQ_REGION_SHIFT			(12)
 
 /* DPM */
@@ -360,6 +375,7 @@
 
 /* Number of global Vport/QCN rate limiters */
 #define MAX_QM_GLOBAL_RLS	256
+#define COMMON_MAX_QM_GLOBAL_RLS MAX_QM_GLOBAL_RLS
 
 /* QM registers data */
 #define QM_LINE_CRD_REG_WIDTH		16
@@ -700,6 +716,13 @@ enum mf_mode {
 	MAX_MF_MODE
 };
 
+/* Per protocol packet duplication enable bit vector. If set, duplicate
+ * offloaded traffic to LL2 debug queueu.
+ */
+struct offload_pkt_dup_enable {
+	__le16 enable_vector;
+};
+
 /* Per-protocol connection types */
 enum protocol_type {
 	PROTOCOLID_TCP_ULP,
@@ -717,6 +740,12 @@ enum protocol_type {
 	MAX_PROTOCOL_TYPE
 };
 
+/* Pstorm packet duplication config */
+struct pstorm_pkt_dup_cfg {
+	struct offload_pkt_dup_enable enable;
+	__le16 reserved[3];
+};
+
 struct regpair {
 	__le32 lo;
 	__le32 hi;
@@ -728,10 +757,24 @@ struct rdma_eqe_destroy_qp {
 	u8 reserved[4];
 };
 
+/* RoCE Suspend Event Data */
+struct rdma_eqe_suspend_qp {
+	__le32 cid;
+	u8 reserved[4];
+};
+
 /* RDMA Event Data Union */
 union rdma_eqe_data {
 	struct regpair async_handle;
 	struct rdma_eqe_destroy_qp rdma_destroy_qp_data;
+	struct rdma_eqe_suspend_qp rdma_suspend_qp_data;
+};
+
+/* Tstorm packet duplication config */
+struct tstorm_pkt_dup_cfg {
+	struct offload_pkt_dup_enable enable;
+	__le16 reserved;
+	__le32 cid;
 };
 
 struct tstorm_queue_zone {
@@ -891,6 +934,15 @@ struct db_legacy_addr {
 #define DB_LEGACY_ADDR_ICID_SHIFT	5
 };
 
+/* Structure for doorbell address, in legacy mode, without DEMS */
+struct db_legacy_wo_dems_addr {
+	__le32 addr;
+#define DB_LEGACY_WO_DEMS_ADDR_RESERVED0_MASK   0x3
+#define DB_LEGACY_WO_DEMS_ADDR_RESERVED0_SHIFT  0
+#define DB_LEGACY_WO_DEMS_ADDR_ICID_MASK        0x3FFFFFFF
+#define DB_LEGACY_WO_DEMS_ADDR_ICID_SHIFT       2
+};
+
 /* Structure for doorbell address, in PWM mode */
 struct db_pwm_addr {
 	__le32 addr;
@@ -906,6 +958,31 @@ struct db_pwm_addr {
 #define DB_PWM_ADDR_RESERVED1_SHIFT	28
 };
 
+/* Parameters to RDMA firmware, passed in EDPM doorbell */
+struct db_rdma_24b_icid_dpm_params {
+	__le32 params;
+#define DB_RDMA_24B_ICID_DPM_PARAMS_SIZE_MASK   0x3F
+#define DB_RDMA_24B_ICID_DPM_PARAMS_SIZE_SHIFT  0
+#define DB_RDMA_24B_ICID_DPM_PARAMS_DPM_TYPE_MASK       0x3
+#define DB_RDMA_24B_ICID_DPM_PARAMS_DPM_TYPE_SHIFT      6
+#define DB_RDMA_24B_ICID_DPM_PARAMS_OPCODE_MASK 0xFF
+#define DB_RDMA_24B_ICID_DPM_PARAMS_OPCODE_SHIFT        8
+#define DB_RDMA_24B_ICID_DPM_PARAMS_ICID_EXT_MASK       0xFF
+#define DB_RDMA_24B_ICID_DPM_PARAMS_ICID_EXT_SHIFT      16
+#define DB_RDMA_24B_ICID_DPM_PARAMS_INV_BYTE_CNT_MASK   0x7
+#define DB_RDMA_24B_ICID_DPM_PARAMS_INV_BYTE_CNT_SHIFT  24
+#define DB_RDMA_24B_ICID_DPM_PARAMS_EXT_ICID_MODE_EN_MASK       0x1
+#define DB_RDMA_24B_ICID_DPM_PARAMS_EXT_ICID_MODE_EN_SHIFT      27
+#define DB_RDMA_24B_ICID_DPM_PARAMS_COMPLETION_FLG_MASK 0x1
+#define DB_RDMA_24B_ICID_DPM_PARAMS_COMPLETION_FLG_SHIFT        28
+#define DB_RDMA_24B_ICID_DPM_PARAMS_S_FLG_MASK  0x1
+#define DB_RDMA_24B_ICID_DPM_PARAMS_S_FLG_SHIFT 29
+#define DB_RDMA_24B_ICID_DPM_PARAMS_RESERVED1_MASK      0x1
+#define DB_RDMA_24B_ICID_DPM_PARAMS_RESERVED1_SHIFT     30
+#define DB_RDMA_24B_ICID_DPM_PARAMS_CONN_TYPE_IS_IWARP_MASK     0x1
+#define DB_RDMA_24B_ICID_DPM_PARAMS_CONN_TYPE_IS_IWARP_SHIFT    31
+};
+
 /* Parameters to RDMA firmware, passed in EDPM doorbell */
 struct db_rdma_dpm_params {
 	__le32 params;
@@ -1220,6 +1297,26 @@ struct rdif_task_context {
 	__le32 reserved2;
 };
 
+/* Searcher Table struct */
+struct src_entry_header {
+	__le32 flags;
+#define SRC_ENTRY_HEADER_NEXT_PTR_TYPE_MASK     0x1
+#define SRC_ENTRY_HEADER_NEXT_PTR_TYPE_SHIFT    0
+#define SRC_ENTRY_HEADER_EMPTY_MASK     0x1
+#define SRC_ENTRY_HEADER_EMPTY_SHIFT    1
+#define SRC_ENTRY_HEADER_RESERVED_MASK  0x3FFFFFFF
+#define SRC_ENTRY_HEADER_RESERVED_SHIFT 2
+	__le32 magic_number;
+	struct regpair next_ptr;
+};
+
+/* Enumeration for address type */
+enum src_header_next_ptr_type_enum {
+	e_physical_addr,
+	e_logical_addr,
+	MAX_SRC_HEADER_NEXT_PTR_TYPE_ENUM
+};
+
 /* Status block structure */
 struct status_block {
 	__le16	pi_array[PIS_PER_SB];
-- 
cgit v1.2.3


From fe40a830dcded26f012739fd6dac0da9c805bc38 Mon Sep 17 00:00:00 2001
From: Prabhakar Kushwaha <pkushwaha@marvell.com>
Date: Mon, 4 Oct 2021 09:58:44 +0300
Subject: qed: Update qed_hsi.h for fw 8.59.1.0

The qed_hsi.h has been updated to support new FW version 8.59.1.0 with
changes.
 - Updates FW HSI (Hardware Software interface) structures.
 - Addition/update in function declaration and defines as per HSI.
 - Add generic infrastructure for FW error reporting as part of
   common event queue handling.
 - Move malicious VF error reporting to FW error reporting
   infrastructure.
 - Move consolidation queue initialization from FW context to ramrod
   message.

qed_hsi.h header file changes lead to change in many files to ensure
compilation.

This patch also fixes the existing checkpatch warnings and few important
checks.

Signed-off-by: Ariel Elior <aelior@marvell.com>
Signed-off-by: Shai Malin <smalin@marvell.com>
Signed-off-by: Omkar Kulkarni <okulkarni@marvell.com>
Signed-off-by: Prabhakar Kushwaha <pkushwaha@marvell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/qed/eth_common.h  | 1 +
 include/linux/qed/rdma_common.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/qed/eth_common.h b/include/linux/qed/eth_common.h
index cd1207ad4ada..c84e08bc6802 100644
--- a/include/linux/qed/eth_common.h
+++ b/include/linux/qed/eth_common.h
@@ -67,6 +67,7 @@
 /* Ethernet vport update constants */
 #define ETH_FILTER_RULES_COUNT		10
 #define ETH_RSS_IND_TABLE_ENTRIES_NUM	128
+#define ETH_RSS_IND_TABLE_MASK_SIZE_REGS    (ETH_RSS_IND_TABLE_ENTRIES_NUM / 32)
 #define ETH_RSS_KEY_SIZE_REGS		10
 #define ETH_RSS_ENGINE_NUM_K2		207
 #define ETH_RSS_ENGINE_NUM_BB		127
diff --git a/include/linux/qed/rdma_common.h b/include/linux/qed/rdma_common.h
index bab078b25834..6dfed163ab6c 100644
--- a/include/linux/qed/rdma_common.h
+++ b/include/linux/qed/rdma_common.h
@@ -27,6 +27,7 @@
 #define RDMA_MAX_PDS			(64 * 1024)
 #define RDMA_MAX_XRC_SRQS                       (1024)
 #define RDMA_MAX_SRQS                           (32 * 1024)
+#define RDMA_MAX_IRQ_ELEMS_IN_PAGE      (128)
 
 #define RDMA_NUM_STATISTIC_COUNTERS	MAX_NUM_VPORTS
 #define RDMA_NUM_STATISTIC_COUNTERS_K2	MAX_NUM_VPORTS_K2
-- 
cgit v1.2.3


From 3a6f5d0cbda37f24f60ca778bd1675125b7d594f Mon Sep 17 00:00:00 2001
From: Nikolay Assa <nassa@marvell.com>
Date: Mon, 4 Oct 2021 09:58:49 +0300
Subject: qed: Update TCP silly-window-syndrome timeout for iwarp, scsi

Update TCP silly-window-syndrome timeout, for the cases where
initiator's small TCP window size prevents FW from transmitting
packets on the connection. Timeout causes FW to retransmit
window probes if needed, preventing I/O stall if initiator ignores
first window probe.

Reviewed-by: Manish Rangankar <mrangankar@marvell.com>
Signed-off-by: Nikolay Assa <nassa@marvell.com>
Signed-off-by: Ariel Elior <aelior@marvell.com>
Signed-off-by: Shai Malin <smalin@marvell.com>
Signed-off-by: Omkar Kulkarni <okulkarni@marvell.com>
Signed-off-by: Prabhakar Kushwaha <pkushwaha@marvell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/qed/qed_if.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index 4dcd0d37a521..07f8d19421ab 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -24,6 +24,8 @@
 #include <linux/io-64-nonatomic-lo-hi.h>
 #include <net/devlink.h>
 
+#define QED_TX_SWS_TIMER_DFLT  500
+
 enum dcbx_protocol_type {
 	DCBX_PROTOCOL_ISCSI,
 	DCBX_PROTOCOL_FCOE,
-- 
cgit v1.2.3


From a64aa0a8b991e3cae61ee8322cf936c00068a5c1 Mon Sep 17 00:00:00 2001
From: Prabhakar Kushwaha <pkushwaha@marvell.com>
Date: Mon, 4 Oct 2021 09:58:50 +0300
Subject: qed: Update the TCP active termination 2 MSL timer ("TIME_WAIT")

Initialize 2 MSL timeout value used for the TCP TIME_WAIT state to
non-zero default.

This patch also removes magic number from qedi/qedi_main.c.

Reviewed-by: Manish Rangankar <mrangankar@marvell.com>
Signed-off-by: Nikolay Assa <nassa@marvell.com>
Signed-off-by: Ariel Elior <aelior@marvell.com>
Signed-off-by: Shai Malin <smalin@marvell.com>
Signed-off-by: Omkar Kulkarni <okulkarni@marvell.com>
Signed-off-by: Prabhakar Kushwaha <pkushwaha@marvell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/qed/qed_if.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index 07f8d19421ab..ad220d5da18f 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -25,6 +25,7 @@
 #include <net/devlink.h>
 
 #define QED_TX_SWS_TIMER_DFLT  500
+#define QED_TWO_MSL_TIMER_DFLT 4000
 
 enum dcbx_protocol_type {
 	DCBX_PROTOCOL_ISCSI,
-- 
cgit v1.2.3


From a2c27a61b4335344c1bacaade61e9b2dc6e12a76 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Mon, 4 Oct 2021 12:03:28 +0100
Subject: net: phylink: add phylink_set_10g_modes() helper

Add a helper for setting 10Gigabit modes, so we have one central
place that sets all appropriate 10G modes for a driver.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phylink.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index 237291196ce2..f7b5ed06a815 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -484,6 +484,7 @@ int phylink_speed_up(struct phylink *pl);
 #define phylink_test(bm, mode)	__phylink_do_bit(test_bit, bm, mode)
 
 void phylink_set_port_modes(unsigned long *bits);
+void phylink_set_10g_modes(unsigned long *mask);
 void phylink_helper_basex_speed(struct phylink_link_state *state);
 
 void phylink_mii_c22_pcs_get_state(struct mdio_device *pcs,
-- 
cgit v1.2.3


From 571e5c0efcb29c5dac8cf2949d3eed84ec43056c Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Wed, 19 May 2021 16:00:22 -0400
Subject: audit: add OPENAT2 record to list "how" info

Since the openat2(2) syscall uses a struct open_how pointer to communicate
its parameters they are not usefully recorded by the audit SYSCALL record's
four existing arguments.

Add a new audit record type OPENAT2 that reports the parameters in its
third argument, struct open_how with fields oflag, mode and resolve.

The new record in the context of an event would look like:
time->Wed Mar 17 16:28:53 2021
type=PROCTITLE msg=audit(1616012933.531:184): proctitle=
  73797363616C6C735F66696C652F6F70656E617432002F746D702F61756469742D
  7465737473756974652D737641440066696C652D6F70656E617432
type=PATH msg=audit(1616012933.531:184): item=1 name="file-openat2"
  inode=29 dev=00:1f mode=0100600 ouid=0 ogid=0 rdev=00:00
  obj=unconfined_u:object_r:user_tmp_t:s0 nametype=CREATE
  cap_fp=0 cap_fi=0 cap_fe=0 cap_fver=0 cap_frootid=0
type=PATH msg=audit(1616012933.531:184):
  item=0 name="/root/rgb/git/audit-testsuite/tests"
  inode=25 dev=00:1f mode=040700 ouid=0 ogid=0 rdev=00:00
  obj=unconfined_u:object_r:user_tmp_t:s0 nametype=PARENT
  cap_fp=0 cap_fi=0 cap_fe=0 cap_fver=0 cap_frootid=0
type=CWD msg=audit(1616012933.531:184):
  cwd="/root/rgb/git/audit-testsuite/tests"
type=OPENAT2 msg=audit(1616012933.531:184):
  oflag=0100302 mode=0600 resolve=0xa
type=SYSCALL msg=audit(1616012933.531:184): arch=c000003e syscall=437
  success=yes exit=4 a0=3 a1=7ffe315f1c53 a2=7ffe315f1550 a3=18
  items=2 ppid=528 pid=540 auid=0 uid=0 gid=0 euid=0 suid=0
  fsuid=0 egid=0 sgid=0 fsgid=0 tty=ttyS0 ses=1 comm="openat2"
  exe="/root/rgb/git/audit-testsuite/tests/syscalls_file/openat2"
  subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023
  key="testsuite-1616012933-bjAUcEPO"

Link: https://lore.kernel.org/r/d23fbb89186754487850367224b060e26f9b7181.1621363275.git.rgb@redhat.com
Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
Acked-by: Christian Brauner <christian.brauner@ubuntu.com>
[PM: tweak subject, wrap example, move AUDIT_OPENAT2 to 1337]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/audit.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 5fbeeeb6b726..0e0eb4c0fa10 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -399,6 +399,7 @@ extern int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
 				  const struct cred *old);
 extern void __audit_log_capset(const struct cred *new, const struct cred *old);
 extern void __audit_mmap_fd(int fd, int flags);
+extern void __audit_openat2_how(struct open_how *how);
 extern void __audit_log_kern_module(char *name);
 extern void __audit_fanotify(unsigned int response);
 extern void __audit_tk_injoffset(struct timespec64 offset);
@@ -495,6 +496,12 @@ static inline void audit_mmap_fd(int fd, int flags)
 		__audit_mmap_fd(fd, flags);
 }
 
+static inline void audit_openat2_how(struct open_how *how)
+{
+	if (unlikely(!audit_dummy_context()))
+		__audit_openat2_how(how);
+}
+
 static inline void audit_log_kern_module(char *name)
 {
 	if (!audit_dummy_context())
@@ -646,6 +653,9 @@ static inline void audit_log_capset(const struct cred *new,
 static inline void audit_mmap_fd(int fd, int flags)
 { }
 
+static inline void audit_openat2_how(struct open_how *how)
+{ }
+
 static inline void audit_log_kern_module(char *name)
 {
 }
-- 
cgit v1.2.3


From 5249001d69a223811ad654884c6115d55158fd51 Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@nvidia.com>
Date: Wed, 1 Sep 2021 19:05:02 +0300
Subject: net/mlx5: Bridge, mark reg_c1 when pushing VLAN

On ingress VLAN push also assign value 0x7FE to reg_c1 tunnel id+opts
bits (tunnel id 0, which is not a valid tunnel id, and option 0x7FE which
was reserved by one of previous patches in the series). In following patch
the reg value is matched on egress miss to restore the packet to its
original state by removing the VLAN before passing it to the software data
path.

Signed-off-by: Vlad Buslov <vladbu@nvidia.com>
Reviewed-by: Paul Blakey <paulb@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/eswitch.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h
index 4ab5c1fc1270..97afcea39a7b 100644
--- a/include/linux/mlx5/eswitch.h
+++ b/include/linux/mlx5/eswitch.h
@@ -130,11 +130,20 @@ u32 mlx5_eswitch_get_vport_metadata_for_set(struct mlx5_eswitch *esw,
 #define ESW_TUN_OPTS_MASK GENMASK(31 - ESW_TUN_ID_BITS - ESW_RESERVED_BITS, ESW_TUN_OPTS_OFFSET)
 #define ESW_TUN_MASK GENMASK(31 - ESW_RESERVED_BITS, ESW_TUN_OFFSET)
 #define ESW_TUN_ID_SLOW_TABLE_GOTO_VPORT 0 /* 0 is not a valid tunnel id */
+#define ESW_TUN_ID_BRIDGE_INGRESS_PUSH_VLAN ESW_TUN_ID_SLOW_TABLE_GOTO_VPORT
 /* 0x7FF is a reserved mapping */
 #define ESW_TUN_OPTS_SLOW_TABLE_GOTO_VPORT GENMASK(ESW_TUN_OPTS_BITS - 1, 0)
 #define ESW_TUN_SLOW_TABLE_GOTO_VPORT ((ESW_TUN_ID_SLOW_TABLE_GOTO_VPORT << ESW_TUN_OPTS_BITS) | \
 				       ESW_TUN_OPTS_SLOW_TABLE_GOTO_VPORT)
 #define ESW_TUN_SLOW_TABLE_GOTO_VPORT_MARK ESW_TUN_OPTS_MASK
+/* 0x7FE is a reserved mapping for bridge ingress push vlan mark */
+#define ESW_TUN_OPTS_BRIDGE_INGRESS_PUSH_VLAN (ESW_TUN_OPTS_SLOW_TABLE_GOTO_VPORT - 1)
+#define ESW_TUN_BRIDGE_INGRESS_PUSH_VLAN ((ESW_TUN_ID_BRIDGE_INGRESS_PUSH_VLAN << \
+					   ESW_TUN_OPTS_BITS) | \
+					  ESW_TUN_OPTS_BRIDGE_INGRESS_PUSH_VLAN)
+#define ESW_TUN_BRIDGE_INGRESS_PUSH_VLAN_MARK \
+	GENMASK(31 - ESW_TUN_ID_BITS - ESW_RESERVED_BITS, \
+		ESW_TUN_OPTS_OFFSET + 1)
 
 u8 mlx5_eswitch_mode(struct mlx5_core_dev *dev);
 u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev);
-- 
cgit v1.2.3


From 3663ad34bc707fc85492f4d83a313f5df84718d4 Mon Sep 17 00:00:00 2001
From: Shay Drory <shayd@nvidia.com>
Date: Thu, 19 Aug 2021 16:18:57 +0300
Subject: net/mlx5: Shift control IRQ to the last index

Control IRQ is the first IRQ vector. This complicates handling of
completion irqs as we need to offset them by one.
in the next patch, there are scenarios where completion and control EQs
will share the same irq. for example: functions with single IRQ. To ease
such scenarios, we shift control IRQ to the end of the irq array.

Signed-off-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/driver.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index e23417424373..0ca719c00824 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -59,6 +59,8 @@
 
 #define MLX5_ADEV_NAME "mlx5_core"
 
+#define MLX5_IRQ_EQ_CTRL (U8_MAX)
+
 enum {
 	MLX5_BOARD_ID_LEN = 64,
 };
-- 
cgit v1.2.3


From f891b7cdbdcda116fd26bbd706f91bd58567aa17 Mon Sep 17 00:00:00 2001
From: Shay Drory <shayd@nvidia.com>
Date: Sun, 1 Aug 2021 12:08:49 +0300
Subject: net/mlx5: Enable single IRQ for PCI Function

Prior to this patch the driver requires two IRQs to function properly,
one required IRQ for control and at least one required IRQ for IO.

This requirement can be relaxed to one as the driver now allows
sharing of IRQs, so control and IO EQs can share the same irq.

This is needed for high scale amount of VFs.

Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/eq.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/eq.h b/include/linux/mlx5/eq.h
index cea6ecb4b73e..ea3ff5a8ced3 100644
--- a/include/linux/mlx5/eq.h
+++ b/include/linux/mlx5/eq.h
@@ -4,7 +4,6 @@
 #ifndef MLX5_CORE_EQ_H
 #define MLX5_CORE_EQ_H
 
-#define MLX5_IRQ_VEC_COMP_BASE 1
 #define MLX5_NUM_CMD_EQE   (32)
 #define MLX5_NUM_ASYNC_EQE (0x1000)
 #define MLX5_NUM_SPARE_EQE (0x80)
-- 
cgit v1.2.3


From e68ce0faf29c7c268666e11e95bf27dca97d28b0 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Thu, 2 Sep 2021 14:47:48 +0200
Subject: mfd: hi6421-spmi-pmic: Cleanup drvdata to only include regmap

There are lots of fields in struct hi6421_spmi_pmic that aren't
used. As a matter of fact, only regmap is needed.

So, drop the struct as a whole, and set regmap as the drvdata.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Acked-by: Mark Brown <broonie@kernel.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/1828cb783b1ebca0b98bf0b3077d8701adb228f7.1630586862.git.mchehab+huawei@kernel.org
---
 include/linux/mfd/hi6421-spmi-pmic.h | 25 -------------------------
 1 file changed, 25 deletions(-)
 delete mode 100644 include/linux/mfd/hi6421-spmi-pmic.h

(limited to 'include/linux')

diff --git a/include/linux/mfd/hi6421-spmi-pmic.h b/include/linux/mfd/hi6421-spmi-pmic.h
deleted file mode 100644
index e5b8dbf828b6..000000000000
--- a/include/linux/mfd/hi6421-spmi-pmic.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Header file for device driver Hi6421 PMIC
- *
- * Copyright (c) 2013 Linaro Ltd.
- * Copyright (C) 2011 Hisilicon.
- * Copyright (c) 2020-2021 Huawei Technologies Co., Ltd
- *
- * Guodong Xu <guodong.xu@linaro.org>
- */
-
-#ifndef	__HISI_PMIC_H
-#define	__HISI_PMIC_H
-
-#include <linux/irqdomain.h>
-#include <linux/regmap.h>
-
-struct hi6421_spmi_pmic {
-	struct resource				*res;
-	struct device				*dev;
-	void __iomem				*regs;
-	struct regmap				*regmap;
-};
-
-#endif		/* __HISI_PMIC_H */
-- 
cgit v1.2.3


From c1baf6c591e6901d3422d7a0d0d32ccf29883edf Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <digetx@gmail.com>
Date: Sun, 12 Sep 2021 21:17:15 +0300
Subject: usb: phy: tegra: Support OTG mode programming

Support programming USB PHY into OTG mode.

Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
Acked-by: Thierry Reding <treding@nvidia.com>
Link: https://lore.kernel.org/r/20210912181718.1328-5-digetx@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/tegra_usb_phy.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/usb/tegra_usb_phy.h b/include/linux/usb/tegra_usb_phy.h
index fd1c9f6a4e37..d3e65eb9e16f 100644
--- a/include/linux/usb/tegra_usb_phy.h
+++ b/include/linux/usb/tegra_usb_phy.h
@@ -18,6 +18,7 @@
 
 #include <linux/clk.h>
 #include <linux/gpio.h>
+#include <linux/regmap.h>
 #include <linux/reset.h>
 #include <linux/usb/otg.h>
 
@@ -30,6 +31,7 @@
  *      enter host mode
  * requires_extra_tuning_parameters: true if xcvr_hsslew, hssquelch_level
  *      and hsdiscon_level should be set for adequate signal quality
+ * requires_pmc_ao_power_up: true if USB AO is powered down by default
  */
 
 struct tegra_phy_soc_config {
@@ -37,6 +39,7 @@ struct tegra_phy_soc_config {
 	bool has_hostpc;
 	bool requires_usbmode_setup;
 	bool requires_extra_tuning_parameters;
+	bool requires_pmc_ao_power_up;
 };
 
 struct tegra_utmip_config {
@@ -62,6 +65,7 @@ enum tegra_usb_phy_port_speed {
 struct tegra_xtal_freq;
 
 struct tegra_usb_phy {
+	int irq;
 	int instance;
 	const struct tegra_xtal_freq *freq;
 	void __iomem *regs;
@@ -70,6 +74,7 @@ struct tegra_usb_phy {
 	struct clk *pll_u;
 	struct clk *pad_clk;
 	struct regulator *vbus;
+	struct regmap *pmc_regmap;
 	enum usb_dr_mode mode;
 	void *config;
 	const struct tegra_phy_soc_config *soc_config;
-- 
cgit v1.2.3


From 1e4071f6282b3323435b02b1719bcfbfe1b57150 Mon Sep 17 00:00:00 2001
From: Corey Minyard <minyard@acm.org>
Date: Fri, 24 Sep 2021 07:12:42 -0500
Subject: ipmi: Export ipmb_checksum()

It will be needed by the upcoming ipmb direct addressing.

Signed-off-by: Corey Minyard <minyard@acm.org>
Tested-by: Andrew Manley <andrew.manley@sealingtech.com>
Reviewed-by: Andrew Manley <andrew.manley@sealingtech.com>
---
 include/linux/ipmi.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ipmi.h b/include/linux/ipmi.h
index 52850a02a3d0..163831a087ef 100644
--- a/include/linux/ipmi.h
+++ b/include/linux/ipmi.h
@@ -335,4 +335,7 @@ extern int ipmi_get_smi_info(int if_num, struct ipmi_smi_info *data);
 
 #define GET_DEVICE_ID_MAX_RETRY		5
 
+/* Helper function for computing the IPMB checksum of some data. */
+unsigned char ipmb_checksum(unsigned char *data, int size);
+
 #endif /* __LINUX_IPMI_H */
-- 
cgit v1.2.3


From 059747c245f0e9af5e109eece7d3414dbe08d513 Mon Sep 17 00:00:00 2001
From: Corey Minyard <minyard@acm.org>
Date: Fri, 24 Sep 2021 11:42:56 -0500
Subject: ipmi: Add support for IPMB direct messages

An application has come up that has a device sitting right on the IPMB
that would like to communicate with the BMC on the IPMB using normal
IPMI commands.

Sending these commands and handling the responses is easy enough, no
modifications are needed to the IPMI infrastructure.  But if this is an
application that also needs to receive IPMB commands and respond, some
way is needed to handle these incoming commands and send the responses.

Currently, the IPMI message handler only sends commands to the interface
and only receives responses from interface.  This change extends the
interface to receive commands/responses and send commands/responses.
These are formatted differently in support of receiving/sending IPMB
messages directly.

Signed-off-by: Corey Minyard <minyard@acm.org>
Tested-by: Andrew Manley <andrew.manley@sealingtech.com>
Reviewed-by: Andrew Manley <andrew.manley@sealingtech.com>
---
 include/linux/ipmi_smi.h | 59 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h
index deec18b8944a..9277d21c2690 100644
--- a/include/linux/ipmi_smi.h
+++ b/include/linux/ipmi_smi.h
@@ -38,6 +38,59 @@ struct ipmi_smi;
 #define IPMI_WATCH_MASK_CHECK_WATCHDOG	(1 << 1)
 #define IPMI_WATCH_MASK_CHECK_COMMANDS	(1 << 2)
 
+/*
+ * SMI messages
+ *
+ * When communicating with an SMI, messages come in two formats:
+ *
+ * * Normal (to a BMC over a BMC interface)
+ *
+ * * IPMB (over a IPMB to another MC)
+ *
+ * When normal, commands are sent using the format defined by a
+ * standard message over KCS (NetFn must be even):
+ *
+ *   +-----------+-----+------+
+ *   | NetFn/LUN | Cmd | Data |
+ *   +-----------+-----+------+
+ *
+ * And responses, similarly, with an completion code added (NetFn must
+ * be odd):
+ *
+ *   +-----------+-----+------+------+
+ *   | NetFn/LUN | Cmd | CC   | Data |
+ *   +-----------+-----+------+------+
+ *
+ * With normal messages, only commands are sent and only responses are
+ * received.
+ *
+ * In IPMB mode, we are acting as an IPMB device. Commands will be in
+ * the following format (NetFn must be even):
+ *
+ *   +-------------+------+-------------+-----+------+
+ *   | NetFn/rsLUN | Addr | rqSeq/rqLUN | Cmd | Data |
+ *   +-------------+------+-------------+-----+------+
+ *
+ * Responses will using the following format:
+ *
+ *   +-------------+------+-------------+-----+------+------+
+ *   | NetFn/rqLUN | Addr | rqSeq/rsLUN | Cmd | CC   | Data |
+ *   +-------------+------+-------------+-----+------+------+
+ *
+ * This is similar to the format defined in the IPMB manual section
+ * 2.11.1 with the checksums and the first address removed.  Also, the
+ * address is always the remote address.
+ *
+ * IPMB messages can be commands and responses in both directions.
+ * Received commands are handled as received commands from the message
+ * queue.
+ */
+
+enum ipmi_smi_msg_type {
+	IPMI_SMI_MSG_TYPE_NORMAL = 0,
+	IPMI_SMI_MSG_TYPE_IPMB_DIRECT
+};
+
 /*
  * Messages to/from the lower layer.  The smi interface will take one
  * of these to send. After the send has occurred and a response has
@@ -54,6 +107,8 @@ struct ipmi_smi;
 struct ipmi_smi_msg {
 	struct list_head link;
 
+	enum ipmi_smi_msg_type type;
+
 	long    msgid;
 	void    *user_data;
 
@@ -73,6 +128,10 @@ struct ipmi_smi_msg {
 struct ipmi_smi_handlers {
 	struct module *owner;
 
+	/* Capabilities of the SMI. */
+#define IPMI_SMI_CAN_HANDLE_IPMB_DIRECT		(1 << 0)
+	unsigned int flags;
+
 	/*
 	 * The low-level interface cannot start sending messages to
 	 * the upper layer until this function is called.  This may
-- 
cgit v1.2.3


From 549017aa1bb7ec19a1e24e7f65480a1c2e76b90e Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 5 Oct 2021 13:52:42 +0200
Subject: netlink: remove netlink_broadcast_filtered

No users in tree since commit a3498436b3a0 ("netns: restrict uevents"),
so remove this functionality.

Cc: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netlink.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 61b1c7fcc401..1ec631838af9 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -156,10 +156,6 @@ bool netlink_strict_get_check(struct sk_buff *skb);
 int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 portid, int nonblock);
 int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 portid,
 		      __u32 group, gfp_t allocation);
-int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb,
-			       __u32 portid, __u32 group, gfp_t allocation,
-			       int (*filter)(struct sock *dsk, struct sk_buff *skb, void *data),
-			       void *filter_data);
 int netlink_set_err(struct sock *ssk, __u32 portid, __u32 group, int code);
 int netlink_register_notifier(struct notifier_block *nb);
 int netlink_unregister_notifier(struct notifier_block *nb);
-- 
cgit v1.2.3


From ded6e16b37e4c8c86cda98604ecd78818d6ca36a Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 4 Oct 2021 12:14:43 -0700
Subject: mlx4: replace mlx4_mac_to_u64() with ether_addr_to_u64()

mlx4_mac_to_u64() predates and opencodes ether_addr_to_u64().
It doesn't make the argument constant so it'll be problematic
when dev->dev_addr becomes a const. Convert to the generic helper.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx4/driver.h | 12 ------------
 1 file changed, 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx4/driver.h b/include/linux/mlx4/driver.h
index a858bcb6220b..b26b71f62fb4 100644
--- a/include/linux/mlx4/driver.h
+++ b/include/linux/mlx4/driver.h
@@ -92,18 +92,6 @@ void *mlx4_get_protocol_dev(struct mlx4_dev *dev, enum mlx4_protocol proto, int
 
 struct devlink_port *mlx4_get_devlink_port(struct mlx4_dev *dev, int port);
 
-static inline u64 mlx4_mac_to_u64(u8 *addr)
-{
-	u64 mac = 0;
-	int i;
-
-	for (i = 0; i < ETH_ALEN; i++) {
-		mac <<= 8;
-		mac |= addr[i];
-	}
-	return mac;
-}
-
 static inline void mlx4_u64_to_mac(u8 *addr, u64 mac)
 {
 	int i;
-- 
cgit v1.2.3


From 1bb96a07f9a8f2fe9725f6689605e32b75d20508 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 4 Oct 2021 12:14:44 -0700
Subject: mlx4: replace mlx4_u64_to_mac() with u64_to_ether_addr()

mlx4_u64_to_mac() predates the common helper but doesn't
make the argument constant.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx4/driver.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx4/driver.h b/include/linux/mlx4/driver.h
index b26b71f62fb4..1834c8fad12e 100644
--- a/include/linux/mlx4/driver.h
+++ b/include/linux/mlx4/driver.h
@@ -92,14 +92,4 @@ void *mlx4_get_protocol_dev(struct mlx4_dev *dev, enum mlx4_protocol proto, int
 
 struct devlink_port *mlx4_get_devlink_port(struct mlx4_dev *dev, int port);
 
-static inline void mlx4_u64_to_mac(u8 *addr, u64 mac)
-{
-	int i;
-
-	for (i = ETH_ALEN; i > 0; i--) {
-		addr[i - 1] = mac & 0xFF;
-		mac >>= 8;
-	}
-}
-
 #endif /* MLX4_DRIVER_H */
-- 
cgit v1.2.3


From ebb1fdb589bd6d0ee647d4fa285bc934ba369cde Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 4 Oct 2021 12:14:46 -0700
Subject: mlx4: constify args for const dev_addr

netdev->dev_addr will become const soon. Make sure all
functions which pass it around mark appropriate args
as const.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx4/device.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 30bb59fe970c..6646634a0b9d 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -1436,7 +1436,7 @@ int mlx4_map_sw_to_hw_steering_id(struct mlx4_dev *dev,
 				  enum mlx4_net_trans_rule_id id);
 int mlx4_hw_rule_sz(struct mlx4_dev *dev, enum mlx4_net_trans_rule_id id);
 
-int mlx4_tunnel_steer_add(struct mlx4_dev *dev, unsigned char *addr,
+int mlx4_tunnel_steer_add(struct mlx4_dev *dev, const unsigned char *addr,
 			  int port, int qpn, u16 prio, u64 *reg_id);
 
 void mlx4_sync_pkey_table(struct mlx4_dev *dev, int slave, int port,
-- 
cgit v1.2.3


From 027b57170bf8bb6999a28e4a5f3d78bf1db0f90c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pali=20Roh=C3=A1r?= <pali@kernel.org>
Date: Sat, 2 Oct 2021 15:09:00 +0200
Subject: serial: core: Fix initializing and restoring termios speed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since commit edc6afc54968 ("tty: switch to ktermios and new framework")
termios speed is no longer stored only in c_cflag member but also in new
additional c_ispeed and c_ospeed members. If BOTHER flag is set in c_cflag
then termios speed is stored only in these new members.

Therefore to correctly restore termios speed it is required to store also
ispeed and ospeed members, not only cflag member.

In case only cflag member with BOTHER flag is restored then functions
tty_termios_baud_rate() and tty_termios_input_baud_rate() returns baudrate
stored in c_ospeed / c_ispeed member, which is zero as it was not restored
too. If reported baudrate is invalid (e.g. zero) then serial core functions
report fallback baudrate value 9600. So it means that in this case original
baudrate is lost and kernel changes it to value 9600.

Simple reproducer of this issue is to boot kernel with following command
line argument: "console=ttyXXX,86400" (where ttyXXX is the device name).
For speed 86400 there is no Bnnn constant and therefore kernel has to
represent this speed via BOTHER c_cflag. Which means that speed is stored
only in c_ospeed and c_ispeed members, not in c_cflag anymore.

If bootloader correctly configures serial device to speed 86400 then kernel
prints boot log to early console at speed speed 86400 without any issue.
But after kernel starts initializing real console device ttyXXX then speed
is changed to fallback value 9600 because information about speed was lost.

This patch fixes above issue by storing and restoring also ispeed and
ospeed members, which are required for BOTHER flag.

Fixes: edc6afc54968 ("[PATCH] tty: switch to ktermios and new framework")
Cc: stable@vger.kernel.org
Signed-off-by: Pali Rohár <pali@kernel.org>
Link: https://lore.kernel.org/r/20211002130900.9518-1-pali@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/console.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/console.h b/include/linux/console.h
index 20874db50bc8..a97f277cfdfa 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -149,6 +149,8 @@ struct console {
 	short	flags;
 	short	index;
 	int	cflag;
+	uint	ispeed;
+	uint	ospeed;
 	void	*data;
 	struct	 console *next;
 };
-- 
cgit v1.2.3


From a130e8fbc7de796eb6e680724d87f4737a26d0ac Mon Sep 17 00:00:00 2001
From: Josh Don <joshdon@google.com>
Date: Fri, 27 Aug 2021 09:54:38 -0700
Subject: fs/proc/uptime.c: Fix idle time reporting in /proc/uptime

/proc/uptime reports idle time by reading the CPUTIME_IDLE field from
the per-cpu kcpustats. However, on NO_HZ systems, idle time is not
continually updated on idle cpus, leading this value to appear
incorrectly small.

/proc/stat performs an accounting update when reading idle time; we
can use the same approach for uptime.

With this patch, /proc/stat and /proc/uptime now agree on idle time.
Additionally, the following shows idle time tick up consistently on an
idle machine:

  (while true; do cat /proc/uptime; sleep 1; done) | awk '{print $2-prev; prev=$2}'

Reported-by: Luigi Rizzo <lrizzo@google.com>
Signed-off-by: Josh Don <joshdon@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lkml.kernel.org/r/20210827165438.3280779-1-joshdon@google.com
---
 include/linux/kernel_stat.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 44ae1a7eb9e3..69ae6b278464 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -102,6 +102,7 @@ extern void account_system_index_time(struct task_struct *, u64,
 				      enum cpu_usage_stat);
 extern void account_steal_time(u64);
 extern void account_idle_time(u64);
+extern u64 get_idle_time(struct kernel_cpustat *kcs, int cpu);
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 static inline void account_process_tick(struct task_struct *tsk, int user)
-- 
cgit v1.2.3


From ceeadb83aea28372e54857bf88ab7e17af48ab7b Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Sun, 5 Sep 2021 14:35:41 +0000
Subject: sched: Make struct sched_statistics independent of fair sched class

If we want to use the schedstats facility to trace other sched classes, we
should make it independent of fair sched class. The struct sched_statistics
is the schedular statistics of a task_struct or a task_group. So we can
move it into struct task_struct and struct task_group to achieve the goal.

After the patch, schestats are orgnized as follows,

    struct task_struct {
       ...
       struct sched_entity se;
       struct sched_rt_entity rt;
       struct sched_dl_entity dl;
       ...
       struct sched_statistics stats;
       ...
   };

Regarding the task group, schedstats is only supported for fair group
sched, and a new struct sched_entity_stats is introduced, suggested by
Peter -

    struct sched_entity_stats {
        struct sched_entity     se;
        struct sched_statistics stats;
    } __no_randomize_layout;

Then with the se in a task_group, we can easily get the stats.

The sched_statistics members may be frequently modified when schedstats is
enabled, in order to avoid impacting on random data which may in the same
cacheline with them, the struct sched_statistics is defined as cacheline
aligned.

As this patch changes the core struct of scheduler, so I verified the
performance it may impact on the scheduler with 'perf bench sched
pipe', suggested by Mel. Below is the result, in which all the values
are in usecs/op.
                                  Before               After
      kernel.sched_schedstats=0  5.2~5.4               5.2~5.4
      kernel.sched_schedstats=1  5.3~5.5               5.3~5.5
[These data is a little difference with the earlier version, that is
 because my old test machine is destroyed so I have to use a new
 different test machine.]

Almost no impact on the sched performance.

No functional change.

[lkp@intel.com: reported build failure in earlier version]

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Link: https://lore.kernel.org/r/20210905143547.4668-3-laoar.shao@gmail.com
---
 include/linux/sched.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c1a927ddec64..2bc4c72fec2d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -522,7 +522,7 @@ struct sched_statistics {
 	u64				nr_wakeups_passive;
 	u64				nr_wakeups_idle;
 #endif
-};
+} ____cacheline_aligned;
 
 struct sched_entity {
 	/* For load-balancing: */
@@ -538,8 +538,6 @@ struct sched_entity {
 
 	u64				nr_migrations;
 
-	struct sched_statistics		statistics;
-
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	int				depth;
 	struct sched_entity		*parent;
@@ -803,6 +801,8 @@ struct task_struct {
 	struct uclamp_se		uclamp[UCLAMP_CNT];
 #endif
 
+	struct sched_statistics         stats;
+
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	/* List of struct preempt_notifier: */
 	struct hlist_head		preempt_notifiers;
-- 
cgit v1.2.3


From 847fc0cd0664fcb2a08ac66df6b85935361ec454 Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Sun, 5 Sep 2021 14:35:43 +0000
Subject: sched: Introduce task block time in schedstats

Currently in schedstats we have sum_sleep_runtime and iowait_sum, but
there's no metric to show how long the task is in D state.  Once a task in
D state, it means the task is blocked in the kernel, for example the
task may be waiting for a mutex. The D state is more frequent than
iowait, and it is more critital than S state. So it is worth to add a
metric to measure it.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20210905143547.4668-5-laoar.shao@gmail.com
---
 include/linux/sched.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2bc4c72fec2d..193e16e2d0e4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -503,6 +503,8 @@ struct sched_statistics {
 
 	u64				block_start;
 	u64				block_max;
+	s64				sum_block_runtime;
+
 	u64				exec_max;
 	u64				slice_max;
 
-- 
cgit v1.2.3


From 8d491de6edc27138806cae6e8eca455beb325b62 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 28 Sep 2021 14:24:32 +0200
Subject: sched: Move mmdrop to RCU on RT

mmdrop() is invoked from finish_task_switch() by the incoming task to drop
the mm which was handed over by the previous task. mmdrop() can be quite
expensive which prevents an incoming real-time task from getting useful
work done.

Provide mmdrop_sched() which maps to mmdrop() on !RT kernels. On RT kernels
it delagates the eventually required invocation of __mmdrop() to RCU.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20210928122411.648582026@linutronix.de
---
 include/linux/mm_types.h |  4 ++++
 include/linux/sched/mm.h | 29 +++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7f8ee09c711f..e9672de22cf2 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -12,6 +12,7 @@
 #include <linux/completion.h>
 #include <linux/cpumask.h>
 #include <linux/uprobes.h>
+#include <linux/rcupdate.h>
 #include <linux/page-flags-layout.h>
 #include <linux/workqueue.h>
 #include <linux/seqlock.h>
@@ -572,6 +573,9 @@ struct mm_struct {
 		bool tlb_flush_batched;
 #endif
 		struct uprobes_state uprobes_state;
+#ifdef CONFIG_PREEMPT_RT
+		struct rcu_head delayed_drop;
+#endif
 #ifdef CONFIG_HUGETLB_PAGE
 		atomic_long_t hugetlb_usage;
 #endif
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 5561486fddef..aca874d33fe6 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -49,6 +49,35 @@ static inline void mmdrop(struct mm_struct *mm)
 		__mmdrop(mm);
 }
 
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * RCU callback for delayed mm drop. Not strictly RCU, but call_rcu() is
+ * by far the least expensive way to do that.
+ */
+static inline void __mmdrop_delayed(struct rcu_head *rhp)
+{
+	struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop);
+
+	__mmdrop(mm);
+}
+
+/*
+ * Invoked from finish_task_switch(). Delegates the heavy lifting on RT
+ * kernels via RCU.
+ */
+static inline void mmdrop_sched(struct mm_struct *mm)
+{
+	/* Provides a full memory barrier. See mmdrop() */
+	if (atomic_dec_and_test(&mm->mm_count))
+		call_rcu(&mm->delayed_drop, __mmdrop_delayed);
+}
+#else
+static inline void mmdrop_sched(struct mm_struct *mm)
+{
+	mmdrop(mm);
+}
+#endif
+
 /**
  * mmget() - Pin the address space associated with a &struct mm_struct.
  * @mm: The address space to pin.
-- 
cgit v1.2.3


From c8ed99533dbc0fcc1142671ec80acb33045d2999 Mon Sep 17 00:00:00 2001
From: Vincent Donnefort <vincent.donnefort@arm.com>
Date: Wed, 8 Sep 2021 15:05:23 +0100
Subject: PM: EM: Mark inefficient states

Some SoCs, such as the sd855 have OPPs within the same performance domain,
whose cost is higher than others with a higher frequency. Even though
those OPPs are interesting from a cooling perspective, it makes no sense
to use them when the device can run at full capacity. Those OPPs handicap
the performance domain, when choosing the most energy-efficient CPU and
are wasting energy. They are inefficient.

Hence, add support for such OPPs to the Energy Model. The table can now
be read skipping inefficient performance states (and by extension,
inefficient OPPs).

Signed-off-by: Vincent Donnefort <vincent.donnefort@arm.com>
Reviewed-by: Matthias Kaehlcke <mka@chromium.org>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/energy_model.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index 39dcadd492b5..3641ca4acf04 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -17,13 +17,25 @@
  *		device). It can be a total power: static and dynamic.
  * @cost:	The cost coefficient associated with this level, used during
  *		energy calculation. Equal to: power * max_frequency / frequency
+ * @flags:	see "em_perf_state flags" description below.
  */
 struct em_perf_state {
 	unsigned long frequency;
 	unsigned long power;
 	unsigned long cost;
+	unsigned long flags;
 };
 
+/*
+ * em_perf_state flags:
+ *
+ * EM_PERF_STATE_INEFFICIENT: The performance state is inefficient. There is
+ * in this em_perf_domain, another performance state with a higher frequency
+ * but a lower or equal power cost. Such inefficient states are ignored when
+ * using em_pd_get_efficient_*() functions.
+ */
+#define EM_PERF_STATE_INEFFICIENT BIT(0)
+
 /**
  * struct em_perf_domain - Performance domain
  * @table:		List of performance states, in ascending order
-- 
cgit v1.2.3


From 88f7a89560f6d0fc7803a8933637488f14e0a098 Mon Sep 17 00:00:00 2001
From: Vincent Donnefort <vincent.donnefort@arm.com>
Date: Wed, 8 Sep 2021 15:05:24 +0100
Subject: PM: EM: Extend em_perf_domain with a flag field

Merge the current "milliwatts" option into a "flag" field. This intends to
prepare the extension of this structure for inefficient states support in
the Energy Model.

Signed-off-by: Vincent Donnefort <vincent.donnefort@arm.com>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/energy_model.h | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index 3641ca4acf04..671440371a95 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -40,8 +40,7 @@ struct em_perf_state {
  * struct em_perf_domain - Performance domain
  * @table:		List of performance states, in ascending order
  * @nr_perf_states:	Number of performance states
- * @milliwatts:		Flag indicating the power values are in milli-Watts
- *			or some other scale.
+ * @flags:		See "em_perf_domain flags"
  * @cpus:		Cpumask covering the CPUs of the domain. It's here
  *			for performance reasons to avoid potential cache
  *			misses during energy calculations in the scheduler
@@ -56,10 +55,18 @@ struct em_perf_state {
 struct em_perf_domain {
 	struct em_perf_state *table;
 	int nr_perf_states;
-	int milliwatts;
+	unsigned long flags;
 	unsigned long cpus[];
 };
 
+/*
+ *  em_perf_domain flags:
+ *
+ *  EM_PERF_DOMAIN_MILLIWATTS: The power values are in milli-Watts or some
+ *  other scale.
+ */
+#define EM_PERF_DOMAIN_MILLIWATTS BIT(0)
+
 #define em_span_cpus(em) (to_cpumask((em)->cpus))
 
 #ifdef CONFIG_ENERGY_MODEL
-- 
cgit v1.2.3


From 8354eb9eb3ddb4a8d0857648a470beffcc9d8639 Mon Sep 17 00:00:00 2001
From: Vincent Donnefort <vincent.donnefort@arm.com>
Date: Wed, 8 Sep 2021 15:05:25 +0100
Subject: PM: EM: Allow skipping inefficient states

The new performance domain flag EM_PERF_DOMAIN_SKIP_INEFFICIENCIES allows
to not take into account inefficient states when estimating energy
consumption. This intends to let the Energy Model know that CPUFreq itself
will skip inefficiencies and such states don't need to be part of the
estimation anymore.

Signed-off-by: Vincent Donnefort <vincent.donnefort@arm.com>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/energy_model.h | 43 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 37 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index 671440371a95..6377adc3b78d 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -64,8 +64,12 @@ struct em_perf_domain {
  *
  *  EM_PERF_DOMAIN_MILLIWATTS: The power values are in milli-Watts or some
  *  other scale.
+ *
+ *  EM_PERF_DOMAIN_SKIP_INEFFICIENCIES: Skip inefficient states when estimating
+ *  energy consumption.
  */
 #define EM_PERF_DOMAIN_MILLIWATTS BIT(0)
+#define EM_PERF_DOMAIN_SKIP_INEFFICIENCIES BIT(1)
 
 #define em_span_cpus(em) (to_cpumask((em)->cpus))
 
@@ -120,6 +124,37 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
 				bool milliwatts);
 void em_dev_unregister_perf_domain(struct device *dev);
 
+/**
+ * em_pd_get_efficient_state() - Get an efficient performance state from the EM
+ * @pd   : Performance domain for which we want an efficient frequency
+ * @freq : Frequency to map with the EM
+ *
+ * It is called from the scheduler code quite frequently and as a consequence
+ * doesn't implement any check.
+ *
+ * Return: An efficient performance state, high enough to meet @freq
+ * requirement.
+ */
+static inline
+struct em_perf_state *em_pd_get_efficient_state(struct em_perf_domain *pd,
+						unsigned long freq)
+{
+	struct em_perf_state *ps;
+	int i;
+
+	for (i = 0; i < pd->nr_perf_states; i++) {
+		ps = &pd->table[i];
+		if (ps->frequency >= freq) {
+			if (pd->flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES &&
+			    ps->flags & EM_PERF_STATE_INEFFICIENT)
+				continue;
+			break;
+		}
+	}
+
+	return ps;
+}
+
 /**
  * em_cpu_energy() - Estimates the energy consumed by the CPUs of a
  *		performance domain
@@ -142,7 +177,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
 {
 	unsigned long freq, scale_cpu;
 	struct em_perf_state *ps;
-	int i, cpu;
+	int cpu;
 
 	if (!sum_util)
 		return 0;
@@ -167,11 +202,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
 	 * Find the lowest performance state of the Energy Model above the
 	 * requested frequency.
 	 */
-	for (i = 0; i < pd->nr_perf_states; i++) {
-		ps = &pd->table[i];
-		if (ps->frequency >= freq)
-			break;
-	}
+	ps = em_pd_get_efficient_state(pd, freq);
 
 	/*
 	 * The capacity of a CPU in the domain at the performance state (ps)
-- 
cgit v1.2.3


From 442d24a5c49a8ed1008dcb9c06dc463d12421e7f Mon Sep 17 00:00:00 2001
From: Vincent Donnefort <vincent.donnefort@arm.com>
Date: Wed, 8 Sep 2021 15:05:27 +0100
Subject: cpufreq: Add an interface to mark inefficient frequencies

Some SoCs such as the sd855 have OPPs within the same policy whose cost is
higher than others with a higher frequency. Those OPPs are inefficients
and it might be interesting for a governor to not use them.

cpufreq_table_set_inefficient() allows the caller to identify a specified
frequency as being inefficient. Inefficient frequencies are only supported
on sorted tables.

Signed-off-by: Vincent Donnefort <vincent.donnefort@arm.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/cpufreq.h | 44 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 41 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index ff88bb3e44fc..16997e1ff3fe 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -660,10 +660,11 @@ struct governor_attr {
  *********************************************************************/
 
 /* Special Values of .frequency field */
-#define CPUFREQ_ENTRY_INVALID	~0u
-#define CPUFREQ_TABLE_END	~1u
+#define CPUFREQ_ENTRY_INVALID		~0u
+#define CPUFREQ_TABLE_END		~1u
 /* Special Values of .flags field */
-#define CPUFREQ_BOOST_FREQ	(1 << 0)
+#define CPUFREQ_BOOST_FREQ		(1 << 0)
+#define CPUFREQ_INEFFICIENT_FREQ	(1 << 1)
 
 struct cpufreq_frequency_table {
 	unsigned int	flags;
@@ -1003,6 +1004,36 @@ static inline int cpufreq_table_count_valid_entries(const struct cpufreq_policy
 	return count;
 }
 
+/**
+ * cpufreq_table_set_inefficient() - Mark a frequency as inefficient
+ * @policy:	the &struct cpufreq_policy containing the inefficient frequency
+ * @frequency:	the inefficient frequency
+ *
+ * The &struct cpufreq_policy must use a sorted frequency table
+ *
+ * Return:	%0 on success or a negative errno code
+ */
+
+static inline int
+cpufreq_table_set_inefficient(struct cpufreq_policy *policy,
+			      unsigned int frequency)
+{
+	struct cpufreq_frequency_table *pos;
+
+	/* Not supported */
+	if (policy->freq_table_sorted == CPUFREQ_TABLE_UNSORTED)
+		return -EINVAL;
+
+	cpufreq_for_each_valid_entry(pos, policy->freq_table) {
+		if (pos->frequency == frequency) {
+			pos->flags |= CPUFREQ_INEFFICIENT_FREQ;
+			return 0;
+		}
+	}
+
+	return -EINVAL;
+}
+
 static inline int parse_perf_domain(int cpu, const char *list_name,
 				    const char *cell_name)
 {
@@ -1071,6 +1102,13 @@ static inline bool policy_has_boost_freq(struct cpufreq_policy *policy)
 	return false;
 }
 
+static inline int
+cpufreq_table_set_inefficient(struct cpufreq_policy *policy,
+			      unsigned int frequency)
+{
+	return -EINVAL;
+}
+
 static inline int of_perf_domain_get_sharing_cpumask(int pcpu, const char *list_name,
 						     const char *cell_name, struct cpumask *cpumask)
 {
-- 
cgit v1.2.3


From 1f39fa0dccff71d4788089b5e617229b19166867 Mon Sep 17 00:00:00 2001
From: Vincent Donnefort <vincent.donnefort@arm.com>
Date: Wed, 8 Sep 2021 15:05:28 +0100
Subject: cpufreq: Introducing CPUFREQ_RELATION_E

This newly introduced flag can be applied by a governor to a CPUFreq
relation, when looking for a frequency within the policy table. The
resolution would then only walk through efficient frequencies.

Even with the flag set, the policy max limit will still be honoured. If no
efficient frequencies can be found within the limits of the policy, an
inefficient one would be returned.

Signed-off-by: Vincent Donnefort <vincent.donnefort@arm.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/cpufreq.h | 111 +++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 86 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 16997e1ff3fe..6d96bd452d55 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -118,6 +118,13 @@ struct cpufreq_policy {
 	 */
 	bool			strict_target;
 
+	/*
+	 * Set if inefficient frequencies were found in the frequency table.
+	 * This indicates if the relation flag CPUFREQ_RELATION_E can be
+	 * honored.
+	 */
+	bool			efficiencies_available;
+
 	/*
 	 * Preferred average time interval between consecutive invocations of
 	 * the driver to set the frequency for this policy.  To be set by the
@@ -273,6 +280,8 @@ static inline void cpufreq_stats_record_transition(struct cpufreq_policy *policy
 #define CPUFREQ_RELATION_L 0  /* lowest frequency at or above target */
 #define CPUFREQ_RELATION_H 1  /* highest frequency below or at target */
 #define CPUFREQ_RELATION_C 2  /* closest frequency to target */
+/* relation flags */
+#define CPUFREQ_RELATION_E BIT(2) /* Get if possible an efficient frequency */
 
 struct freq_attr {
 	struct attribute attr;
@@ -741,6 +750,22 @@ static inline void dev_pm_opp_free_cpufreq_table(struct device *dev,
 			continue;					\
 		else
 
+/**
+ * cpufreq_for_each_efficient_entry_idx - iterate with index over a cpufreq
+ *	frequency_table excluding CPUFREQ_ENTRY_INVALID and
+ *	CPUFREQ_INEFFICIENT_FREQ frequencies.
+ * @pos: the &struct cpufreq_frequency_table to use as a loop cursor.
+ * @table: the &struct cpufreq_frequency_table to iterate over.
+ * @idx: the table entry currently being processed.
+ * @efficiencies: set to true to only iterate over efficient frequencies.
+ */
+
+#define cpufreq_for_each_efficient_entry_idx(pos, table, idx, efficiencies)	\
+	cpufreq_for_each_valid_entry_idx(pos, table, idx)			\
+		if (efficiencies && (pos->flags & CPUFREQ_INEFFICIENT_FREQ))	\
+			continue;						\
+		else
+
 
 int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy,
 				    struct cpufreq_frequency_table *table);
@@ -765,14 +790,15 @@ bool policy_has_boost_freq(struct cpufreq_policy *policy);
 
 /* Find lowest freq at or above target in a table in ascending order */
 static inline int cpufreq_table_find_index_al(struct cpufreq_policy *policy,
-					      unsigned int target_freq)
+					      unsigned int target_freq,
+					      bool efficiencies)
 {
 	struct cpufreq_frequency_table *table = policy->freq_table;
 	struct cpufreq_frequency_table *pos;
 	unsigned int freq;
 	int idx, best = -1;
 
-	cpufreq_for_each_valid_entry_idx(pos, table, idx) {
+	cpufreq_for_each_efficient_entry_idx(pos, table, idx, efficiencies) {
 		freq = pos->frequency;
 
 		if (freq >= target_freq)
@@ -786,14 +812,15 @@ static inline int cpufreq_table_find_index_al(struct cpufreq_policy *policy,
 
 /* Find lowest freq at or above target in a table in descending order */
 static inline int cpufreq_table_find_index_dl(struct cpufreq_policy *policy,
-					      unsigned int target_freq)
+					      unsigned int target_freq,
+					      bool efficiencies)
 {
 	struct cpufreq_frequency_table *table = policy->freq_table;
 	struct cpufreq_frequency_table *pos;
 	unsigned int freq;
 	int idx, best = -1;
 
-	cpufreq_for_each_valid_entry_idx(pos, table, idx) {
+	cpufreq_for_each_efficient_entry_idx(pos, table, idx, efficiencies) {
 		freq = pos->frequency;
 
 		if (freq == target_freq)
@@ -816,26 +843,30 @@ static inline int cpufreq_table_find_index_dl(struct cpufreq_policy *policy,
 
 /* Works only on sorted freq-tables */
 static inline int cpufreq_table_find_index_l(struct cpufreq_policy *policy,
-					     unsigned int target_freq)
+					     unsigned int target_freq,
+					     bool efficiencies)
 {
 	target_freq = clamp_val(target_freq, policy->min, policy->max);
 
 	if (policy->freq_table_sorted == CPUFREQ_TABLE_SORTED_ASCENDING)
-		return cpufreq_table_find_index_al(policy, target_freq);
+		return cpufreq_table_find_index_al(policy, target_freq,
+						   efficiencies);
 	else
-		return cpufreq_table_find_index_dl(policy, target_freq);
+		return cpufreq_table_find_index_dl(policy, target_freq,
+						   efficiencies);
 }
 
 /* Find highest freq at or below target in a table in ascending order */
 static inline int cpufreq_table_find_index_ah(struct cpufreq_policy *policy,
-					      unsigned int target_freq)
+					      unsigned int target_freq,
+					      bool efficiencies)
 {
 	struct cpufreq_frequency_table *table = policy->freq_table;
 	struct cpufreq_frequency_table *pos;
 	unsigned int freq;
 	int idx, best = -1;
 
-	cpufreq_for_each_valid_entry_idx(pos, table, idx) {
+	cpufreq_for_each_efficient_entry_idx(pos, table, idx, efficiencies) {
 		freq = pos->frequency;
 
 		if (freq == target_freq)
@@ -858,14 +889,15 @@ static inline int cpufreq_table_find_index_ah(struct cpufreq_policy *policy,
 
 /* Find highest freq at or below target in a table in descending order */
 static inline int cpufreq_table_find_index_dh(struct cpufreq_policy *policy,
-					      unsigned int target_freq)
+					      unsigned int target_freq,
+					      bool efficiencies)
 {
 	struct cpufreq_frequency_table *table = policy->freq_table;
 	struct cpufreq_frequency_table *pos;
 	unsigned int freq;
 	int idx, best = -1;
 
-	cpufreq_for_each_valid_entry_idx(pos, table, idx) {
+	cpufreq_for_each_efficient_entry_idx(pos, table, idx, efficiencies) {
 		freq = pos->frequency;
 
 		if (freq <= target_freq)
@@ -879,26 +911,30 @@ static inline int cpufreq_table_find_index_dh(struct cpufreq_policy *policy,
 
 /* Works only on sorted freq-tables */
 static inline int cpufreq_table_find_index_h(struct cpufreq_policy *policy,
-					     unsigned int target_freq)
+					     unsigned int target_freq,
+					     bool efficiencies)
 {
 	target_freq = clamp_val(target_freq, policy->min, policy->max);
 
 	if (policy->freq_table_sorted == CPUFREQ_TABLE_SORTED_ASCENDING)
-		return cpufreq_table_find_index_ah(policy, target_freq);
+		return cpufreq_table_find_index_ah(policy, target_freq,
+						   efficiencies);
 	else
-		return cpufreq_table_find_index_dh(policy, target_freq);
+		return cpufreq_table_find_index_dh(policy, target_freq,
+						   efficiencies);
 }
 
 /* Find closest freq to target in a table in ascending order */
 static inline int cpufreq_table_find_index_ac(struct cpufreq_policy *policy,
-					      unsigned int target_freq)
+					      unsigned int target_freq,
+					      bool efficiencies)
 {
 	struct cpufreq_frequency_table *table = policy->freq_table;
 	struct cpufreq_frequency_table *pos;
 	unsigned int freq;
 	int idx, best = -1;
 
-	cpufreq_for_each_valid_entry_idx(pos, table, idx) {
+	cpufreq_for_each_efficient_entry_idx(pos, table, idx, efficiencies) {
 		freq = pos->frequency;
 
 		if (freq == target_freq)
@@ -925,14 +961,15 @@ static inline int cpufreq_table_find_index_ac(struct cpufreq_policy *policy,
 
 /* Find closest freq to target in a table in descending order */
 static inline int cpufreq_table_find_index_dc(struct cpufreq_policy *policy,
-					      unsigned int target_freq)
+					      unsigned int target_freq,
+					      bool efficiencies)
 {
 	struct cpufreq_frequency_table *table = policy->freq_table;
 	struct cpufreq_frequency_table *pos;
 	unsigned int freq;
 	int idx, best = -1;
 
-	cpufreq_for_each_valid_entry_idx(pos, table, idx) {
+	cpufreq_for_each_efficient_entry_idx(pos, table, idx, efficiencies) {
 		freq = pos->frequency;
 
 		if (freq == target_freq)
@@ -959,35 +996,58 @@ static inline int cpufreq_table_find_index_dc(struct cpufreq_policy *policy,
 
 /* Works only on sorted freq-tables */
 static inline int cpufreq_table_find_index_c(struct cpufreq_policy *policy,
-					     unsigned int target_freq)
+					     unsigned int target_freq,
+					     bool efficiencies)
 {
 	target_freq = clamp_val(target_freq, policy->min, policy->max);
 
 	if (policy->freq_table_sorted == CPUFREQ_TABLE_SORTED_ASCENDING)
-		return cpufreq_table_find_index_ac(policy, target_freq);
+		return cpufreq_table_find_index_ac(policy, target_freq,
+						   efficiencies);
 	else
-		return cpufreq_table_find_index_dc(policy, target_freq);
+		return cpufreq_table_find_index_dc(policy, target_freq,
+						   efficiencies);
 }
 
 static inline int cpufreq_frequency_table_target(struct cpufreq_policy *policy,
 						 unsigned int target_freq,
 						 unsigned int relation)
 {
+	bool efficiencies = policy->efficiencies_available &&
+			    (relation & CPUFREQ_RELATION_E);
+	int idx;
+
+	/* cpufreq_table_index_unsorted() has no use for this flag anyway */
+	relation &= ~CPUFREQ_RELATION_E;
+
 	if (unlikely(policy->freq_table_sorted == CPUFREQ_TABLE_UNSORTED))
 		return cpufreq_table_index_unsorted(policy, target_freq,
 						    relation);
-
+retry:
 	switch (relation) {
 	case CPUFREQ_RELATION_L:
-		return cpufreq_table_find_index_l(policy, target_freq);
+		idx = cpufreq_table_find_index_l(policy, target_freq,
+						 efficiencies);
+		break;
 	case CPUFREQ_RELATION_H:
-		return cpufreq_table_find_index_h(policy, target_freq);
+		idx = cpufreq_table_find_index_h(policy, target_freq,
+						 efficiencies);
+		break;
 	case CPUFREQ_RELATION_C:
-		return cpufreq_table_find_index_c(policy, target_freq);
+		idx = cpufreq_table_find_index_c(policy, target_freq,
+						 efficiencies);
+		break;
 	default:
 		WARN_ON_ONCE(1);
 		return 0;
 	}
+
+	if (idx < 0 && efficiencies) {
+		efficiencies = false;
+		goto retry;
+	}
+
+	return idx;
 }
 
 static inline int cpufreq_table_count_valid_entries(const struct cpufreq_policy *policy)
@@ -1027,6 +1087,7 @@ cpufreq_table_set_inefficient(struct cpufreq_policy *policy,
 	cpufreq_for_each_valid_entry(pos, policy->freq_table) {
 		if (pos->frequency == frequency) {
 			pos->flags |= CPUFREQ_INEFFICIENT_FREQ;
+			policy->efficiencies_available = true;
 			return 0;
 		}
 	}
-- 
cgit v1.2.3


From b894d20e6867f04827c7817fbc155460ff108f6f Mon Sep 17 00:00:00 2001
From: Vincent Donnefort <vincent.donnefort@arm.com>
Date: Wed, 8 Sep 2021 15:05:29 +0100
Subject: cpufreq: Use CPUFREQ_RELATION_E in DVFS governors

Let the governors schedutil, conservative and ondemand to work, if possible
on efficient frequencies only.

Signed-off-by: Vincent Donnefort <vincent.donnefort@arm.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/cpufreq.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 6d96bd452d55..7ce71c7371fb 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -283,6 +283,10 @@ static inline void cpufreq_stats_record_transition(struct cpufreq_policy *policy
 /* relation flags */
 #define CPUFREQ_RELATION_E BIT(2) /* Get if possible an efficient frequency */
 
+#define CPUFREQ_RELATION_LE (CPUFREQ_RELATION_L | CPUFREQ_RELATION_E)
+#define CPUFREQ_RELATION_HE (CPUFREQ_RELATION_H | CPUFREQ_RELATION_E)
+#define CPUFREQ_RELATION_CE (CPUFREQ_RELATION_C | CPUFREQ_RELATION_E)
+
 struct freq_attr {
 	struct attribute attr;
 	ssize_t (*show)(struct cpufreq_policy *, char *);
@@ -636,9 +640,11 @@ struct cpufreq_governor *cpufreq_fallback_governor(void);
 static inline void cpufreq_policy_apply_limits(struct cpufreq_policy *policy)
 {
 	if (policy->max < policy->cur)
-		__cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H);
+		__cpufreq_driver_target(policy, policy->max,
+					CPUFREQ_RELATION_HE);
 	else if (policy->min > policy->cur)
-		__cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L);
+		__cpufreq_driver_target(policy, policy->min,
+					CPUFREQ_RELATION_LE);
 }
 
 /* Governor attribute set */
-- 
cgit v1.2.3


From 4ad81f6ef89b62434f1d2ed26e9bec9d0e3d9dfe Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Mon, 4 Oct 2021 22:06:41 +0200
Subject: clk: tegra: Add stubs needed for compile testing

These stubs are needed to allow the tegra-cpuidle driver to be
compile-tested.

Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 include/linux/clk/tegra.h | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/clk/tegra.h b/include/linux/clk/tegra.h
index d128ad1570aa..3650e926e93f 100644
--- a/include/linux/clk/tegra.h
+++ b/include/linux/clk/tegra.h
@@ -42,6 +42,7 @@ struct tegra_cpu_car_ops {
 #endif
 };
 
+#ifdef CONFIG_ARCH_TEGRA
 extern struct tegra_cpu_car_ops *tegra_cpu_car_ops;
 
 static inline void tegra_wait_cpu_in_reset(u32 cpu)
@@ -83,8 +84,29 @@ static inline void tegra_disable_cpu_clock(u32 cpu)
 
 	tegra_cpu_car_ops->disable_clock(cpu);
 }
+#else
+static inline void tegra_wait_cpu_in_reset(u32 cpu)
+{
+}
 
-#ifdef CONFIG_PM_SLEEP
+static inline void tegra_put_cpu_in_reset(u32 cpu)
+{
+}
+
+static inline void tegra_cpu_out_of_reset(u32 cpu)
+{
+}
+
+static inline void tegra_enable_cpu_clock(u32 cpu)
+{
+}
+
+static inline void tegra_disable_cpu_clock(u32 cpu)
+{
+}
+#endif
+
+#if defined(CONFIG_ARCH_TEGRA) && defined(CONFIG_PM_SLEEP)
 static inline bool tegra_cpu_rail_off_ready(void)
 {
 	if (WARN_ON(!tegra_cpu_car_ops->rail_off_ready))
-- 
cgit v1.2.3


From 2357672c54c3f748f675446f8eba8b0432b1e7e2 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Sat, 2 Oct 2021 06:47:49 +0530
Subject: bpf: Introduce BPF support for kernel module function calls

This change adds support on the kernel side to allow for BPF programs to
call kernel module functions. Userspace will prepare an array of module
BTF fds that is passed in during BPF_PROG_LOAD using fd_array parameter.
In the kernel, the module BTFs are placed in the auxilliary struct for
bpf_prog, and loaded as needed.

The verifier then uses insn->off to index into the fd_array. insn->off
0 is reserved for vmlinux BTF (for backwards compat), so userspace must
use an fd_array index > 0 for module kfunc support. kfunc_btf_tab is
sorted based on offset in an array, and each offset corresponds to one
descriptor, with a max limit up to 256 such module BTFs.

We also change existing kfunc_tab to distinguish each element based on
imm, off pair as each such call will now be distinct.

Another change is to check_kfunc_call callback, which now include a
struct module * pointer, this is to be used in later patch such that the
kfunc_id and module pointer are matched for dynamically registered BTF
sets from loadable modules, so that same kfunc_id in two modules doesn't
lead to check_kfunc_call succeeding. For the duration of the
check_kfunc_call, the reference to struct module exists, as it returns
the pointer stored in kfunc_btf_tab.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211002011757.311265-2-memxor@gmail.com
---
 include/linux/bpf.h          | 8 +++++---
 include/linux/bpf_verifier.h | 2 ++
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1c7fd7c4c6d3..d604c8251d88 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -513,7 +513,7 @@ struct bpf_verifier_ops {
 				 const struct btf_type *t, int off, int size,
 				 enum bpf_access_type atype,
 				 u32 *next_btf_id);
-	bool (*check_kfunc_call)(u32 kfunc_btf_id);
+	bool (*check_kfunc_call)(u32 kfunc_btf_id, struct module *owner);
 };
 
 struct bpf_prog_offload_ops {
@@ -877,6 +877,7 @@ struct bpf_prog_aux {
 	void *jit_data; /* JIT specific data. arch dependent */
 	struct bpf_jit_poke_descriptor *poke_tab;
 	struct bpf_kfunc_desc_tab *kfunc_tab;
+	struct bpf_kfunc_btf_tab *kfunc_btf_tab;
 	u32 size_poke_tab;
 	struct bpf_ksym ksym;
 	const struct bpf_prog_ops *ops;
@@ -1639,7 +1640,7 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog,
 int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog,
 				const union bpf_attr *kattr,
 				union bpf_attr __user *uattr);
-bool bpf_prog_test_check_kfunc_call(u32 kfunc_id);
+bool bpf_prog_test_check_kfunc_call(u32 kfunc_id, struct module *owner);
 bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 		    const struct bpf_prog *prog,
 		    struct bpf_insn_access_aux *info);
@@ -1860,7 +1861,8 @@ static inline int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog,
 	return -ENOTSUPP;
 }
 
-static inline bool bpf_prog_test_check_kfunc_call(u32 kfunc_id)
+static inline bool bpf_prog_test_check_kfunc_call(u32 kfunc_id,
+						  struct module *owner)
 {
 	return false;
 }
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 5424124dbe36..c8a78e830fca 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -527,5 +527,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 			    const struct bpf_prog *tgt_prog,
 			    u32 btf_id,
 			    struct bpf_attach_target_info *tgt_info);
+void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab);
+
 
 #endif /* _LINUX_BPF_VERIFIER_H */
-- 
cgit v1.2.3


From 14f267d95fe4b08831a022c8e15a2eb8991edbf6 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Sat, 2 Oct 2021 06:47:51 +0530
Subject: bpf: btf: Introduce helpers for dynamic BTF set registration

This adds helpers for registering btf_id_set from modules and the
bpf_check_mod_kfunc_call callback that can be used to look them up.

With in kernel sets, the way this is supposed to work is, in kernel
callback looks up within the in-kernel kfunc whitelist, and then defers
to the dynamic BTF set lookup if it doesn't find the BTF id. If there is
no in-kernel BTF id set, this callback can be used directly.

Also fix includes for btf.h and bpfptr.h so that they can included in
isolation. This is in preparation for their usage in tcp_bbr, tcp_cubic
and tcp_dctcp modules in the next patch.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211002011757.311265-4-memxor@gmail.com
---
 include/linux/bpfptr.h |  1 +
 include/linux/btf.h    | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpfptr.h b/include/linux/bpfptr.h
index 546e27fc6d46..46e1757d06a3 100644
--- a/include/linux/bpfptr.h
+++ b/include/linux/bpfptr.h
@@ -3,6 +3,7 @@
 #ifndef _LINUX_BPFPTR_H
 #define _LINUX_BPFPTR_H
 
+#include <linux/mm.h>
 #include <linux/sockptr.h>
 
 typedef sockptr_t bpfptr_t;
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 214fde93214b..6c4c61d821d7 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -5,6 +5,7 @@
 #define _LINUX_BTF_H 1
 
 #include <linux/types.h>
+#include <linux/bpfptr.h>
 #include <uapi/linux/btf.h>
 #include <uapi/linux/bpf.h>
 
@@ -238,4 +239,39 @@ static inline const char *btf_name_by_offset(const struct btf *btf,
 }
 #endif
 
+struct kfunc_btf_id_set {
+	struct list_head list;
+	struct btf_id_set *set;
+	struct module *owner;
+};
+
+struct kfunc_btf_id_list;
+
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
+			       struct kfunc_btf_id_set *s);
+void unregister_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
+				 struct kfunc_btf_id_set *s);
+bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id,
+			      struct module *owner);
+#else
+static inline void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
+					     struct kfunc_btf_id_set *s)
+{
+}
+static inline void unregister_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
+					       struct kfunc_btf_id_set *s)
+{
+}
+static inline bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist,
+					    u32 kfunc_id, struct module *owner)
+{
+	return false;
+}
+#endif
+
+#define DEFINE_KFUNC_BTF_ID_SET(set, name)                                     \
+	struct kfunc_btf_id_set name = { LIST_HEAD_INIT(name.list), (set),     \
+					 THIS_MODULE }
+
 #endif
-- 
cgit v1.2.3


From 0e32dfc80bae53b05e9eda7eaf259f30ab9ba43a Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Sat, 2 Oct 2021 06:47:53 +0530
Subject: bpf: Enable TCP congestion control kfunc from modules

This commit moves BTF ID lookup into the newly added registration
helper, in a way that the bbr, cubic, and dctcp implementation set up
their sets in the bpf_tcp_ca kfunc_btf_set list, while the ones not
dependent on modules are looked up from the wrapper function.

This lifts the restriction for them to be compiled as built in objects,
and can be loaded as modules if required. Also modify Makefile.modfinal
to call resolve_btfids for each module.

Note that since kernel kfunc_ids never overlap with module kfunc_ids, we
only match the owner for module btf id sets.

See following commits for background on use of:

 CONFIG_X86 ifdef:
 569c484f9995 (bpf: Limit static tcp-cc functions in the .BTF_ids list to x86)

 CONFIG_DYNAMIC_FTRACE ifdef:
 7aae231ac93b (bpf: tcp: Limit calling some tcp cc functions to CONFIG_DYNAMIC_FTRACE)

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211002011757.311265-6-memxor@gmail.com
---
 include/linux/btf.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/btf.h b/include/linux/btf.h
index 6c4c61d821d7..1d56cd2bb362 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -274,4 +274,6 @@ static inline bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist,
 	struct kfunc_btf_id_set name = { LIST_HEAD_INIT(name.list), (set),     \
 					 THIS_MODULE }
 
+extern struct kfunc_btf_id_list bpf_tcp_ca_kfunc_list;
+
 #endif
-- 
cgit v1.2.3


From c48e51c8b07aba8a18125221cb67a40cb1256bf2 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Sat, 2 Oct 2021 06:47:57 +0530
Subject: bpf: selftests: Add selftests for module kfunc support

This adds selftests that tests the success and failure path for modules
kfuncs (in presence of invalid kfunc calls) for both libbpf and
gen_loader. It also adds a prog_test kfunc_btf_id_list so that we can
add module BTF ID set from bpf_testmod.

This also introduces  a couple of test cases to verifier selftests for
validating whether we get an error or not depending on if invalid kfunc
call remains after elimination of unreachable instructions.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211002011757.311265-10-memxor@gmail.com
---
 include/linux/btf.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/btf.h b/include/linux/btf.h
index 1d56cd2bb362..203eef993d76 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -275,5 +275,6 @@ static inline bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist,
 					 THIS_MODULE }
 
 extern struct kfunc_btf_id_list bpf_tcp_ca_kfunc_list;
+extern struct kfunc_btf_id_list prog_test_kfunc_list;
 
 #endif
-- 
cgit v1.2.3


From c921ff373b469ad7907cde219fa700909f59cac4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
Date: Tue, 15 Jun 2021 15:10:03 +0200
Subject: dma-buf: add dma_resv_for_each_fence_unlocked v8
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Abstract the complexity of iterating over all the fences
in a dma_resv object.

The new loop handles the whole RCU and retry dance and
returns only fences where we can be sure we grabbed the
right one.

v2: fix accessing the shared fences while they might be freed,
    improve kerneldoc, rename _cursor to _iter, add
    dma_resv_iter_is_exclusive, add dma_resv_iter_begin/end

v3: restructor the code, move rcu_read_lock()/unlock() into the
    iterator, add dma_resv_iter_is_restarted()

v4: fix NULL deref when no explicit fence exists, drop superflous
    rcu_read_lock()/unlock() calls.

v5: fix typos in the documentation

v6: fix coding error when excl fence is NULL

v7: one more logic fix

v8: fix index check in dma_resv_iter_is_exclusive()

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> (v7)
Link: https://patchwork.freedesktop.org/patch/msgid/20211005113742.1101-2-christian.koenig@amd.com
---
 include/linux/dma-resv.h | 95 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 95 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/dma-resv.h b/include/linux/dma-resv.h
index 9100dd3dc21f..764138ad8583 100644
--- a/include/linux/dma-resv.h
+++ b/include/linux/dma-resv.h
@@ -149,6 +149,101 @@ struct dma_resv {
 	struct dma_resv_list __rcu *fence;
 };
 
+/**
+ * struct dma_resv_iter - current position into the dma_resv fences
+ *
+ * Don't touch this directly in the driver, use the accessor function instead.
+ */
+struct dma_resv_iter {
+	/** @obj: The dma_resv object we iterate over */
+	struct dma_resv *obj;
+
+	/** @all_fences: If all fences should be returned */
+	bool all_fences;
+
+	/** @fence: the currently handled fence */
+	struct dma_fence *fence;
+
+	/** @seq: sequence number to check for modifications */
+	unsigned int seq;
+
+	/** @index: index into the shared fences */
+	unsigned int index;
+
+	/** @fences: the shared fences */
+	struct dma_resv_list *fences;
+
+	/** @is_restarted: true if this is the first returned fence */
+	bool is_restarted;
+};
+
+struct dma_fence *dma_resv_iter_first_unlocked(struct dma_resv_iter *cursor);
+struct dma_fence *dma_resv_iter_next_unlocked(struct dma_resv_iter *cursor);
+
+/**
+ * dma_resv_iter_begin - initialize a dma_resv_iter object
+ * @cursor: The dma_resv_iter object to initialize
+ * @obj: The dma_resv object which we want to iterate over
+ * @all_fences: If all fences should be returned or just the exclusive one
+ */
+static inline void dma_resv_iter_begin(struct dma_resv_iter *cursor,
+				       struct dma_resv *obj,
+				       bool all_fences)
+{
+	cursor->obj = obj;
+	cursor->all_fences = all_fences;
+	cursor->fence = NULL;
+}
+
+/**
+ * dma_resv_iter_end - cleanup a dma_resv_iter object
+ * @cursor: the dma_resv_iter object which should be cleaned up
+ *
+ * Make sure that the reference to the fence in the cursor is properly
+ * dropped.
+ */
+static inline void dma_resv_iter_end(struct dma_resv_iter *cursor)
+{
+	dma_fence_put(cursor->fence);
+}
+
+/**
+ * dma_resv_iter_is_exclusive - test if the current fence is the exclusive one
+ * @cursor: the cursor of the current position
+ *
+ * Returns true if the currently returned fence is the exclusive one.
+ */
+static inline bool dma_resv_iter_is_exclusive(struct dma_resv_iter *cursor)
+{
+	return cursor->index == 0;
+}
+
+/**
+ * dma_resv_iter_is_restarted - test if this is the first fence after a restart
+ * @cursor: the cursor with the current position
+ *
+ * Return true if this is the first fence in an iteration after a restart.
+ */
+static inline bool dma_resv_iter_is_restarted(struct dma_resv_iter *cursor)
+{
+	return cursor->is_restarted;
+}
+
+/**
+ * dma_resv_for_each_fence_unlocked - unlocked fence iterator
+ * @cursor: a struct dma_resv_iter pointer
+ * @fence: the current fence
+ *
+ * Iterate over the fences in a struct dma_resv object without holding the
+ * &dma_resv.lock and using RCU instead. The cursor needs to be initialized
+ * with dma_resv_iter_begin() and cleaned up with dma_resv_iter_end(). Inside
+ * the iterator a reference to the dma_fence is held and the RCU lock dropped.
+ * When the dma_resv is modified the iteration starts over again.
+ */
+#define dma_resv_for_each_fence_unlocked(cursor, fence)			\
+	for (fence = dma_resv_iter_first_unlocked(cursor);		\
+	     fence; fence = dma_resv_iter_next_unlocked(cursor))
+
 #define dma_resv_held(obj) lockdep_is_held(&(obj)->lock.base)
 #define dma_resv_assert_held(obj) lockdep_assert_held(&(obj)->lock.base)
 
-- 
cgit v1.2.3


From 4f627af8e6068892cafe031df6c14e8a0aaaa426 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 2 Sep 2021 16:10:11 -0500
Subject: ptrace: Remove the unnecessary arguments from arch_ptrace_stop

Both arch_ptrace_stop_needed and arch_ptrace_stop are called with an
exit_code and a siginfo structure.  Neither argument is used by any of
the implementations so just remove the unneeded arguments.

The two arechitectures that implement arch_ptrace_stop are ia64 and
sparc.  Both architectures flush their register stacks before a
ptrace_stack so that all of the register information can be accessed
by debuggers.

As the question of if a register stack needs to be flushed is
independent of why ptrace is stopping not needing arguments make sense.

Cc: David Miller <davem@davemloft.net>
Cc: sparclinux@vger.kernel.org
Link: https://lkml.kernel.org/r/87lf3mx290.fsf@disp2133
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/ptrace.h | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index b5ebf6c01292..8aee2945ff08 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -362,29 +362,25 @@ static inline void user_single_step_report(struct pt_regs *regs)
 #ifndef arch_ptrace_stop_needed
 /**
  * arch_ptrace_stop_needed - Decide whether arch_ptrace_stop() should be called
- * @code:	current->exit_code value ptrace will stop with
- * @info:	siginfo_t pointer (or %NULL) for signal ptrace will stop with
  *
  * This is called with the siglock held, to decide whether or not it's
- * necessary to release the siglock and call arch_ptrace_stop() with the
- * same @code and @info arguments.  It can be defined to a constant if
- * arch_ptrace_stop() is never required, or always is.  On machines where
- * this makes sense, it should be defined to a quick test to optimize out
- * calling arch_ptrace_stop() when it would be superfluous.  For example,
- * if the thread has not been back to user mode since the last stop, the
- * thread state might indicate that nothing needs to be done.
+ * necessary to release the siglock and call arch_ptrace_stop().  It can be
+ * defined to a constant if arch_ptrace_stop() is never required, or always
+ * is.  On machines where this makes sense, it should be defined to a quick
+ * test to optimize out calling arch_ptrace_stop() when it would be
+ * superfluous.  For example, if the thread has not been back to user mode
+ * since the last stop, the thread state might indicate that nothing needs
+ * to be done.
  *
  * This is guaranteed to be invoked once before a task stops for ptrace and
  * may include arch-specific operations necessary prior to a ptrace stop.
  */
-#define arch_ptrace_stop_needed(code, info)	(0)
+#define arch_ptrace_stop_needed()	(0)
 #endif
 
 #ifndef arch_ptrace_stop
 /**
  * arch_ptrace_stop - Do machine-specific work before stopping for ptrace
- * @code:	current->exit_code value ptrace will stop with
- * @info:	siginfo_t pointer (or %NULL) for signal ptrace will stop with
  *
  * This is called with no locks held when arch_ptrace_stop_needed() has
  * just returned nonzero.  It is allowed to block, e.g. for user memory
@@ -394,7 +390,7 @@ static inline void user_single_step_report(struct pt_regs *regs)
  * we only do it when the arch requires it for this particular stop, as
  * indicated by arch_ptrace_stop_needed().
  */
-#define arch_ptrace_stop(code, info)		do { } while (0)
+#define arch_ptrace_stop()		do { } while (0)
 #endif
 
 #ifndef current_pt_regs
-- 
cgit v1.2.3


From 92307383082daff5df884a25df9e283efb7ef261 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 1 Sep 2021 11:33:50 -0500
Subject: coredump:  Don't perform any cleanups before dumping core

Rename coredump_exit_mm to coredump_task_exit and call it from do_exit
before PTRACE_EVENT_EXIT, and before any cleanup work for a task
happens.  This ensures that an accurate copy of the process can be
captured in the coredump as no cleanup for the process happens before
the coredump completes.  This also ensures that PTRACE_EVENT_EXIT
will not be visited by any thread until the coredump is complete.

Add a new flag PF_POSTCOREDUMP so that tasks that have passed through
coredump_task_exit can be recognized and ignored in zap_process.

Now that all of the coredumping happens before exit_mm remove code to
test for a coredump in progress from mm_release.

Replace "may_ptrace_stop()" with a simple test of "current->ptrace".
The other tests in may_ptrace_stop all concern avoiding stopping
during a coredump.  These tests are no longer necessary as it is now
guaranteed that fatal_signal_pending will be set if the code enters
ptrace_stop during a coredump.  The code in ptrace_stop is guaranteed
not to stop if fatal_signal_pending returns true.

Until this change "ptrace_event(PTRACE_EVENT_EXIT)" could call
ptrace_stop without fatal_signal_pending being true, as signals are
dequeued in get_signal before calling do_exit.  This is no longer
an issue as "ptrace_event(PTRACE_EVENT_EXIT)" is no longer reached
until after the coredump completes.

Link: https://lkml.kernel.org/r/874kaax26c.fsf@disp2133
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/sched.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index e12b524426b0..f3741f23935e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1664,6 +1664,7 @@ extern struct pid *cad_pid;
 #define PF_VCPU			0x00000001	/* I'm a virtual CPU */
 #define PF_IDLE			0x00000002	/* I am an IDLE thread */
 #define PF_EXITING		0x00000004	/* Getting shut down */
+#define PF_POSTCOREDUMP		0x00000008	/* Coredumps should ignore this task */
 #define PF_IO_WORKER		0x00000010	/* Task is an IO worker */
 #define PF_WQ_WORKER		0x00000020	/* I'm a workqueue worker */
 #define PF_FORKNOEXEC		0x00000040	/* Forked but didn't exec */
-- 
cgit v1.2.3


From 353407d917b2d87cd8104a0453d012439c6ca4be Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Wed, 6 Oct 2021 13:46:42 +0300
Subject: ethtool: Add ability to control transceiver modules' power mode

Add a pair of new ethtool messages, 'ETHTOOL_MSG_MODULE_SET' and
'ETHTOOL_MSG_MODULE_GET', that can be used to control transceiver
modules parameters and retrieve their status.

The first parameter to control is the power mode of the module. It is
only relevant for paged memory modules, as flat memory modules always
operate in low power mode.

When a paged memory module is in low power mode, its power consumption
is reduced to the minimum, the management interface towards the host is
available and the data path is deactivated.

User space can choose to put modules that are not currently in use in
low power mode and transition them to high power mode before putting the
associated ports administratively up. This is useful for user space that
favors reduced power consumption and lower temperatures over reduced
link up times. In QSFP-DD modules the transition from low power mode to
high power mode can take a few seconds and this transition is only
expected to get longer with future / more complex modules.

User space can control the power mode of the module via the power mode
policy attribute ('ETHTOOL_A_MODULE_POWER_MODE_POLICY'). Possible
values:

* high: Module is always in high power mode.

* auto: Module is transitioned by the host to high power mode when the
  first port using it is put administratively up and to low power mode
  when the last port using it is put administratively down.

The operational power mode of the module is available to user space via
the 'ETHTOOL_A_MODULE_POWER_MODE' attribute. The attribute is not
reported to user space when a module is not plugged-in.

The user API is designed to be generic enough so that it could be used
for modules with different memory maps (e.g., SFF-8636, CMIS).

The only implementation of the device driver API in this series is for a
MAC driver (mlxsw) where the module is controlled by the device's
firmware, but it is designed to be generic enough so that it could also
be used by implementations where the module is controlled by the CPU.

CMIS testing
============

 # ethtool -m swp11
 Identifier                                : 0x18 (QSFP-DD Double Density 8X Pluggable Transceiver (INF-8628))
 ...
 Module State                              : 0x03 (ModuleReady)
 LowPwrAllowRequestHW                      : Off
 LowPwrRequestSW                           : Off

The module is not in low power mode, as it is not forced by hardware
(LowPwrAllowRequestHW is off) or by software (LowPwrRequestSW is off).

The power mode can be queried from the kernel. In case
LowPwrAllowRequestHW was on, the kernel would need to take into account
the state of the LowPwrRequestHW signal, which is not visible to user
space.

 $ ethtool --show-module swp11
 Module parameters for swp11:
 power-mode-policy high
 power-mode high

Change the power mode policy to 'auto':

 # ethtool --set-module swp11 power-mode-policy auto

Query the power mode again:

 $ ethtool --show-module swp11
 Module parameters for swp11:
 power-mode-policy auto
 power-mode low

Verify with the data read from the EEPROM:

 # ethtool -m swp11
 Identifier                                : 0x18 (QSFP-DD Double Density 8X Pluggable Transceiver (INF-8628))
 ...
 Module State                              : 0x01 (ModuleLowPwr)
 LowPwrAllowRequestHW                      : Off
 LowPwrRequestSW                           : On

Put the associated port administratively up which will instruct the host
to transition the module to high power mode:

 # ip link set dev swp11 up

Query the power mode again:

 $ ethtool --show-module swp11
 Module parameters for swp11:
 power-mode-policy auto
 power-mode high

Verify with the data read from the EEPROM:

 # ethtool -m swp11
 Identifier                                : 0x18 (QSFP-DD Double Density 8X Pluggable Transceiver (INF-8628))
 ...
 Module State                              : 0x03 (ModuleReady)
 LowPwrAllowRequestHW                      : Off
 LowPwrRequestSW                           : Off

Put the associated port administratively down which will instruct the
host to transition the module to low power mode:

 # ip link set dev swp11 down

Query the power mode again:

 $ ethtool --show-module swp11
 Module parameters for swp11:
 power-mode-policy auto
 power-mode low

Verify with the data read from the EEPROM:

 # ethtool -m swp11
 Identifier                                : 0x18 (QSFP-DD Double Density 8X Pluggable Transceiver (INF-8628))
 ...
 Module State                              : 0x01 (ModuleLowPwr)
 LowPwrAllowRequestHW                      : Off
 LowPwrRequestSW                           : On

SFF-8636 testing
================

 # ethtool -m swp13
 Identifier                                : 0x11 (QSFP28)
 ...
 Extended identifier description           : 5.0W max. Power consumption,  High Power Class (> 3.5 W) enabled
 Power set                                 : Off
 Power override                            : On
 ...
 Transmit avg optical power (Channel 1)    : 0.7733 mW / -1.12 dBm
 Transmit avg optical power (Channel 2)    : 0.7649 mW / -1.16 dBm
 Transmit avg optical power (Channel 3)    : 0.7790 mW / -1.08 dBm
 Transmit avg optical power (Channel 4)    : 0.7837 mW / -1.06 dBm
 Rcvr signal avg optical power(Channel 1)  : 0.9302 mW / -0.31 dBm
 Rcvr signal avg optical power(Channel 2)  : 0.9079 mW / -0.42 dBm
 Rcvr signal avg optical power(Channel 3)  : 0.8993 mW / -0.46 dBm
 Rcvr signal avg optical power(Channel 4)  : 0.8778 mW / -0.57 dBm

The module is not in low power mode, as it is not forced by hardware
(Power override is on) or by software (Power set is off).

The power mode can be queried from the kernel. In case Power override
was off, the kernel would need to take into account the state of the
LPMode signal, which is not visible to user space.

 $ ethtool --show-module swp13
 Module parameters for swp13:
 power-mode-policy high
 power-mode high

Change the power mode policy to 'auto':

 # ethtool --set-module swp13 power-mode-policy auto

Query the power mode again:

 $ ethtool --show-module swp13
 Module parameters for swp13:
 power-mode-policy auto
 power-mode low

Verify with the data read from the EEPROM:

 # ethtool -m swp13
 Identifier                                : 0x11 (QSFP28)
 Extended identifier description           : 5.0W max. Power consumption,  High Power Class (> 3.5 W) not enabled
 Power set                                 : On
 Power override                            : On
 ...
 Transmit avg optical power (Channel 1)    : 0.0000 mW / -inf dBm
 Transmit avg optical power (Channel 2)    : 0.0000 mW / -inf dBm
 Transmit avg optical power (Channel 3)    : 0.0000 mW / -inf dBm
 Transmit avg optical power (Channel 4)    : 0.0000 mW / -inf dBm
 Rcvr signal avg optical power(Channel 1)  : 0.0000 mW / -inf dBm
 Rcvr signal avg optical power(Channel 2)  : 0.0000 mW / -inf dBm
 Rcvr signal avg optical power(Channel 3)  : 0.0000 mW / -inf dBm
 Rcvr signal avg optical power(Channel 4)  : 0.0000 mW / -inf dBm

Put the associated port administratively up which will instruct the host
to transition the module to high power mode:

 # ip link set dev swp13 up

Query the power mode again:

 $ ethtool --show-module swp13
 Module parameters for swp13:
 power-mode-policy auto
 power-mode high

Verify with the data read from the EEPROM:

 # ethtool -m swp13
 Identifier                                : 0x11 (QSFP28)
 ...
 Extended identifier description           : 5.0W max. Power consumption,  High Power Class (> 3.5 W) enabled
 Power set                                 : Off
 Power override                            : On
 ...
 Transmit avg optical power (Channel 1)    : 0.7934 mW / -1.01 dBm
 Transmit avg optical power (Channel 2)    : 0.7859 mW / -1.05 dBm
 Transmit avg optical power (Channel 3)    : 0.7885 mW / -1.03 dBm
 Transmit avg optical power (Channel 4)    : 0.7985 mW / -0.98 dBm
 Rcvr signal avg optical power(Channel 1)  : 0.9325 mW / -0.30 dBm
 Rcvr signal avg optical power(Channel 2)  : 0.9034 mW / -0.44 dBm
 Rcvr signal avg optical power(Channel 3)  : 0.9086 mW / -0.42 dBm
 Rcvr signal avg optical power(Channel 4)  : 0.8885 mW / -0.51 dBm

Put the associated port administratively down which will instruct the
host to transition the module to low power mode:

 # ip link set dev swp13 down

Query the power mode again:

 $ ethtool --show-module swp13
 Module parameters for swp13:
 power-mode-policy auto
 power-mode low

Verify with the data read from the EEPROM:

 # ethtool -m swp13
 Identifier                                : 0x11 (QSFP28)
 ...
 Extended identifier description           : 5.0W max. Power consumption,  High Power Class (> 3.5 W) not enabled
 Power set                                 : On
 Power override                            : On
 ...
 Transmit avg optical power (Channel 1)    : 0.0000 mW / -inf dBm
 Transmit avg optical power (Channel 2)    : 0.0000 mW / -inf dBm
 Transmit avg optical power (Channel 3)    : 0.0000 mW / -inf dBm
 Transmit avg optical power (Channel 4)    : 0.0000 mW / -inf dBm
 Rcvr signal avg optical power(Channel 1)  : 0.0000 mW / -inf dBm
 Rcvr signal avg optical power(Channel 2)  : 0.0000 mW / -inf dBm
 Rcvr signal avg optical power(Channel 3)  : 0.0000 mW / -inf dBm
 Rcvr signal avg optical power(Channel 4)  : 0.0000 mW / -inf dBm

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/ethtool.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 849524b55d89..9adf8d2c3144 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -415,6 +415,17 @@ struct ethtool_module_eeprom {
 	u8	*data;
 };
 
+/**
+ * struct ethtool_module_power_mode_params - module power mode parameters
+ * @policy: The power mode policy enforced by the host for the plug-in module.
+ * @mode: The operational power mode of the plug-in module. Should be filled by
+ *	device drivers on get operations.
+ */
+struct ethtool_module_power_mode_params {
+	enum ethtool_module_power_mode_policy policy;
+	enum ethtool_module_power_mode mode;
+};
+
 /**
  * struct ethtool_ops - optional netdev operations
  * @cap_link_lanes_supported: indicates if the driver supports lanes
@@ -580,6 +591,11 @@ struct ethtool_module_eeprom {
  * @get_eth_ctrl_stats: Query some of the IEEE 802.3 MAC Ctrl statistics.
  * @get_rmon_stats: Query some of the RMON (RFC 2819) statistics.
  *	Set %ranges to a pointer to zero-terminated array of byte ranges.
+ * @get_module_power_mode: Get the power mode policy for the plug-in module
+ *	used by the network device and its operational power mode, if
+ *	plugged-in.
+ * @set_module_power_mode: Set the power mode policy for the plug-in module
+ *	used by the network device.
  *
  * All operations are optional (i.e. the function pointer may be set
  * to %NULL) and callers must take this into account.  Callers must
@@ -705,6 +721,12 @@ struct ethtool_ops {
 	void	(*get_rmon_stats)(struct net_device *dev,
 				  struct ethtool_rmon_stats *rmon_stats,
 				  const struct ethtool_rmon_hist_range **ranges);
+	int	(*get_module_power_mode)(struct net_device *dev,
+					 struct ethtool_module_power_mode_params *params,
+					 struct netlink_ext_ack *extack);
+	int	(*set_module_power_mode)(struct net_device *dev,
+					 const struct ethtool_module_power_mode_params *params,
+					 struct netlink_ext_ack *extack);
 };
 
 int ethtool_check_ops(const struct ethtool_ops *ops);
-- 
cgit v1.2.3


From 3dfb51126064b594470b9c0b278188fbc9194709 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Wed, 6 Oct 2021 13:46:46 +0300
Subject: ethtool: Add transceiver module extended state

Add an extended state and sub-state to describe link issues related to
transceiver modules.

The 'ETHTOOL_LINK_EXT_SUBSTATE_MODULE_CMIS_NOT_READY' extended sub-state
tells user space that port is unable to gain a carrier because the CMIS
Module State Machine did not reach the ModuleReady (Fully Operational)
state. For example, if the module is stuck at ModuleLowPwr or
ModuleFault state. In case of the latter, user space can read the fault
reason from the module's EEPROM and potentially reset it.

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/ethtool.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 9adf8d2c3144..845a0ffc16ee 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -94,6 +94,7 @@ struct ethtool_link_ext_state_info {
 		enum ethtool_link_ext_substate_link_logical_mismatch link_logical_mismatch;
 		enum ethtool_link_ext_substate_bad_signal_integrity bad_signal_integrity;
 		enum ethtool_link_ext_substate_cable_issue cable_issue;
+		enum ethtool_link_ext_substate_module module;
 		u8 __link_ext_substate;
 	};
 };
-- 
cgit v1.2.3


From 79365f36d1de87286bb4fc0abcb2a01678ef4bef Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Wed, 6 Oct 2021 13:19:20 +0100
Subject: net: mdio: add mdiobus_modify_changed()

Add mdiobus_modify_changed() helper to reflect the phylib and similar
equivalents. This will avoid this functionality being open-coded, as
has already happened in phylink, and it looks like other users will be
appearing soon.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/mdio.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mdio.h b/include/linux/mdio.h
index 5e6dc38f418e..f622888a4ba8 100644
--- a/include/linux/mdio.h
+++ b/include/linux/mdio.h
@@ -349,6 +349,8 @@ int mdiobus_write(struct mii_bus *bus, int addr, u32 regnum, u16 val);
 int mdiobus_write_nested(struct mii_bus *bus, int addr, u32 regnum, u16 val);
 int mdiobus_modify(struct mii_bus *bus, int addr, u32 regnum, u16 mask,
 		   u16 set);
+int mdiobus_modify_changed(struct mii_bus *bus, int addr, u32 regnum,
+			   u16 mask, u16 set);
 
 static inline u32 mdiobus_c45_addr(int devad, u16 regnum)
 {
-- 
cgit v1.2.3


From af8cc9600bbf2251b04c56139f7c83f87c3f878a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 23 Sep 2021 14:10:51 -0300
Subject: futex: Split out syscalls
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Put the syscalls in their own little file.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: André Almeida <andrealmeid@collabora.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: André Almeida <andrealmeid@collabora.com>
Link: https://lore.kernel.org/r/20210923171111.300673-3-andrealmeid@collabora.com
---
 include/linux/syscalls.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 252243c7783d..25979682ade5 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -610,7 +610,7 @@ asmlinkage long sys_waitid(int which, pid_t pid,
 asmlinkage long sys_set_tid_address(int __user *tidptr);
 asmlinkage long sys_unshare(unsigned long unshare_flags);
 
-/* kernel/futex.c */
+/* kernel/futex/syscalls.c */
 asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
 			  const struct __kernel_timespec __user *utime,
 			  u32 __user *uaddr2, u32 val3);
-- 
cgit v1.2.3


From bf69bad38cf63d980e8a603f8d1bd1f85b5ed3d9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Almeida?= <andrealmeid@collabora.com>
Date: Thu, 23 Sep 2021 14:11:05 -0300
Subject: futex: Implement sys_futex_waitv()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add support to wait on multiple futexes. This is the interface
implemented by this syscall:

futex_waitv(struct futex_waitv *waiters, unsigned int nr_futexes,
	    unsigned int flags, struct timespec *timeout, clockid_t clockid)

struct futex_waitv {
	__u64 val;
	__u64 uaddr;
	__u32 flags;
	__u32 __reserved;
};

Given an array of struct futex_waitv, wait on each uaddr. The thread
wakes if a futex_wake() is performed at any uaddr. The syscall returns
immediately if any waiter has *uaddr != val. *timeout is an optional
absolute timeout value for the operation. This syscall supports only
64bit sized timeout structs. The flags argument of the syscall should be
empty, but it can be used for future extensions. Flags for shared
futexes, sizes, etc. should be used on the individual flags of each
waiter.

__reserved is used for explicit padding and should be 0, but it might be
used for future extensions. If the userspace uses 32-bit pointers, it
should make sure to explicitly cast it when assigning to waitv::uaddr.

Returns the array index of one of the woken futexes. There’s no given
information of how many were woken, or any particular attribute of it
(if it’s the first woken, if it is of the smaller index...).

Signed-off-by: André Almeida <andrealmeid@collabora.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20210923171111.300673-17-andrealmeid@collabora.com
---
 include/linux/syscalls.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 25979682ade5..528a478dbda8 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -58,6 +58,7 @@ struct mq_attr;
 struct compat_stat;
 struct old_timeval32;
 struct robust_list_head;
+struct futex_waitv;
 struct getcpu_cache;
 struct old_linux_dirent;
 struct perf_event_attr;
@@ -623,6 +624,10 @@ asmlinkage long sys_get_robust_list(int pid,
 asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
 				    size_t len);
 
+asmlinkage long sys_futex_waitv(struct futex_waitv *waiters,
+				unsigned int nr_futexes, unsigned int flags,
+				struct __kernel_timespec __user *timeout, clockid_t clockid);
+
 /* kernel/hrtimer.c */
 asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp,
 			      struct __kernel_timespec __user *rmtp);
-- 
cgit v1.2.3


From 9b3c4ab3045e953670c7de9c1165fae5358a7237 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 21 Sep 2021 21:54:32 +0200
Subject: sched,rcu: Rework try_invoke_on_locked_down_task()

Give try_invoke_on_locked_down_task() a saner name and have it return
an int so that the caller might distinguish between different reasons
of failure.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Paul E. McKenney <paulmck@kernel.org>
Acked-by: Vasily Gorbik <gor@linux.ibm.com>
Tested-by: Vasily Gorbik <gor@linux.ibm.com> # on s390
Link: https://lkml.kernel.org/r/20210929152428.649944917@infradead.org
---
 include/linux/wait.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 93dab0e9580f..2d0df57c9902 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -1160,6 +1160,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i
 		(wait)->flags = 0;						\
 	} while (0)
 
-bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg);
+typedef int (*task_call_f)(struct task_struct *p, void *arg);
+extern int task_call_func(struct task_struct *p, task_call_f func, void *arg);
 
 #endif /* _LINUX_WAIT_H */
-- 
cgit v1.2.3


From e330fb14590c5c80f7195c3d8c9b4bcf79e1a5cd Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 6 Oct 2021 18:06:54 -0700
Subject: of: net: move of_net under net/

Rob suggests to move of_net.c from under drivers/of/ somewhere
to the networking code.

Suggested-by: Rob Herring <robh@kernel.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/of_net.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/of_net.h b/include/linux/of_net.h
index daef3b0d9270..cf31188329b5 100644
--- a/include/linux/of_net.h
+++ b/include/linux/of_net.h
@@ -8,7 +8,7 @@
 
 #include <linux/phy.h>
 
-#ifdef CONFIG_OF_NET
+#ifdef CONFIG_OF
 #include <linux/of.h>
 
 struct net_device;
-- 
cgit v1.2.3


From d466effe282ddbab6acb6c3120c1de0ee1b86d57 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 6 Oct 2021 18:06:55 -0700
Subject: of: net: add a helper for loading netdev->dev_addr

Commit 406f42fa0d3c ("net-next: When a bond have a massive amount
of VLANs...") introduced a rbtree for faster Ethernet address look
up. To maintain netdev->dev_addr in this tree we need to make all
the writes to it got through appropriate helpers.

There are roughly 40 places where netdev->dev_addr is passed
as the destination to a of_get_mac_address() call. Add a helper
which takes a dev pointer instead, so it can call an appropriate
helper.

Note that of_get_mac_address() already assumes the address is
6 bytes long (ETH_ALEN) so use eth_hw_addr_set().

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/of_net.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/of_net.h b/include/linux/of_net.h
index cf31188329b5..0797e2edb8c2 100644
--- a/include/linux/of_net.h
+++ b/include/linux/of_net.h
@@ -14,6 +14,7 @@
 struct net_device;
 extern int of_get_phy_mode(struct device_node *np, phy_interface_t *interface);
 extern int of_get_mac_address(struct device_node *np, u8 *mac);
+int of_get_ethdev_address(struct device_node *np, struct net_device *dev);
 extern struct net_device *of_find_net_device_by_node(struct device_node *np);
 #else
 static inline int of_get_phy_mode(struct device_node *np,
@@ -27,6 +28,11 @@ static inline int of_get_mac_address(struct device_node *np, u8 *mac)
 	return -ENODEV;
 }
 
+static inline int of_get_ethdev_address(struct device_node *np, struct net_device *dev)
+{
+	return -ENODEV;
+}
+
 static inline struct net_device *of_find_net_device_by_node(struct device_node *np)
 {
 	return NULL;
-- 
cgit v1.2.3


From 433baf0719d6a81d0587ea27545a120a3880abf6 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 6 Oct 2021 18:06:57 -0700
Subject: device property: move mac addr helpers to eth.c

Move the mac address helpers out, eth.c already contains
a bunch of similar helpers.

Suggested-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/etherdevice.h | 6 ++++++
 include/linux/property.h    | 4 ----
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index e7b2e5fd8d24..39f0758274ae 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -26,9 +26,15 @@
 
 #ifdef __KERNEL__
 struct device;
+struct fwnode_handle;
+
 int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr);
 unsigned char *arch_get_platform_mac_address(void);
 int nvmem_get_mac_address(struct device *dev, void *addrbuf);
+void *device_get_mac_address(struct device *dev, char *addr, int alen);
+void *fwnode_get_mac_address(struct fwnode_handle *fwnode,
+			     char *addr, int alen);
+
 u32 eth_get_headlen(const struct net_device *dev, const void *data, u32 len);
 __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev);
 extern const struct header_ops eth_header_ops;
diff --git a/include/linux/property.h b/include/linux/property.h
index 357513a977e5..4fb081684255 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -389,11 +389,7 @@ const void *device_get_match_data(struct device *dev);
 
 int device_get_phy_mode(struct device *dev);
 
-void *device_get_mac_address(struct device *dev, char *addr, int alen);
-
 int fwnode_get_phy_mode(struct fwnode_handle *fwnode);
-void *fwnode_get_mac_address(struct fwnode_handle *fwnode,
-			     char *addr, int alen);
 struct fwnode_handle *fwnode_graph_get_next_endpoint(
 	const struct fwnode_handle *fwnode, struct fwnode_handle *prev);
 struct fwnode_handle *
-- 
cgit v1.2.3


From 8017c4d8173cfe086420dc5710d631cabd03ef67 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 6 Oct 2021 18:06:58 -0700
Subject: eth: fwnode: change the return type of mac address helpers

fwnode_get_mac_address() and device_get_mac_address()
return a pointer to the buffer that was passed to them
on success or NULL on failure. None of the callers
care about the actual value, only if it's NULL or not.

These semantics differ from of_get_mac_address() which
returns an int so to avoid confusion make the device
helpers return an errno.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/etherdevice.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 39f0758274ae..8299f1cd9175 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -31,9 +31,8 @@ struct fwnode_handle;
 int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr);
 unsigned char *arch_get_platform_mac_address(void);
 int nvmem_get_mac_address(struct device *dev, void *addrbuf);
-void *device_get_mac_address(struct device *dev, char *addr, int alen);
-void *fwnode_get_mac_address(struct fwnode_handle *fwnode,
-			     char *addr, int alen);
+int device_get_mac_address(struct device *dev, char *addr, int alen);
+int fwnode_get_mac_address(struct fwnode_handle *fwnode, char *addr, int alen);
 
 u32 eth_get_headlen(const struct net_device *dev, const void *data, u32 len);
 __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev);
-- 
cgit v1.2.3


From 0a14501ed818ff51eed237bbe5009d0d784e4450 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 6 Oct 2021 18:06:59 -0700
Subject: eth: fwnode: remove the addr len from mac helpers

All callers pass in ETH_ALEN and the function itself
will return -EINVAL for any other address length.
Just assume it's ETH_ALEN like all other mac address
helpers (nvm, of, platform).

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/etherdevice.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 8299f1cd9175..bb612c7382e3 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -31,8 +31,8 @@ struct fwnode_handle;
 int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr);
 unsigned char *arch_get_platform_mac_address(void);
 int nvmem_get_mac_address(struct device *dev, void *addrbuf);
-int device_get_mac_address(struct device *dev, char *addr, int alen);
-int fwnode_get_mac_address(struct fwnode_handle *fwnode, char *addr, int alen);
+int device_get_mac_address(struct device *dev, char *addr);
+int fwnode_get_mac_address(struct fwnode_handle *fwnode, char *addr);
 
 u32 eth_get_headlen(const struct net_device *dev, const void *data, u32 len);
 __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev);
-- 
cgit v1.2.3


From d9eb44904e87c8ad1da0240849dbab638bacb799 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 6 Oct 2021 18:07:00 -0700
Subject: eth: fwnode: add a helper for loading netdev->dev_addr

Commit 406f42fa0d3c ("net-next: When a bond have a massive amount
of VLANs...") introduced a rbtree for faster Ethernet address look
up. To maintain netdev->dev_addr in this tree we need to make all
the writes to it got through appropriate helpers.

There is a handful of drivers which pass netdev->dev_addr as
the destination buffer to device_get_mac_address(). Add a helper
which takes a dev pointer instead, so it can call an appropriate
helper.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/etherdevice.h | 1 +
 include/linux/property.h    | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index bb612c7382e3..a8bb64cf4079 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -32,6 +32,7 @@ int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr);
 unsigned char *arch_get_platform_mac_address(void);
 int nvmem_get_mac_address(struct device *dev, void *addrbuf);
 int device_get_mac_address(struct device *dev, char *addr);
+int device_get_ethdev_address(struct device *dev, struct net_device *netdev);
 int fwnode_get_mac_address(struct fwnode_handle *fwnode, char *addr);
 
 u32 eth_get_headlen(const struct net_device *dev, const void *data, u32 len);
diff --git a/include/linux/property.h b/include/linux/property.h
index 4fb081684255..88fa726a76df 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -15,6 +15,7 @@
 #include <linux/types.h>
 
 struct device;
+struct net_device;
 
 enum dev_prop_type {
 	DEV_PROP_U8,
-- 
cgit v1.2.3


From 5baaac3184ab896d74993825858f1b1a46c460ce Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
Date: Tue, 15 Jun 2021 15:10:03 +0200
Subject: dma-buf: add dma_resv_for_each_fence v3
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A simpler version of the iterator to be used when the dma_resv object is
locked.

v2: fix index check here as well
v3: minor coding improvement, some documentation cleanup

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20211006123609.2026-1-christian.koenig@amd.com
---
 include/linux/dma-resv.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/dma-resv.h b/include/linux/dma-resv.h
index 764138ad8583..491359cea54c 100644
--- a/include/linux/dma-resv.h
+++ b/include/linux/dma-resv.h
@@ -179,6 +179,8 @@ struct dma_resv_iter {
 
 struct dma_fence *dma_resv_iter_first_unlocked(struct dma_resv_iter *cursor);
 struct dma_fence *dma_resv_iter_next_unlocked(struct dma_resv_iter *cursor);
+struct dma_fence *dma_resv_iter_first(struct dma_resv_iter *cursor);
+struct dma_fence *dma_resv_iter_next(struct dma_resv_iter *cursor);
 
 /**
  * dma_resv_iter_begin - initialize a dma_resv_iter object
@@ -244,6 +246,24 @@ static inline bool dma_resv_iter_is_restarted(struct dma_resv_iter *cursor)
 	for (fence = dma_resv_iter_first_unlocked(cursor);		\
 	     fence; fence = dma_resv_iter_next_unlocked(cursor))
 
+/**
+ * dma_resv_for_each_fence - fence iterator
+ * @cursor: a struct dma_resv_iter pointer
+ * @obj: a dma_resv object pointer
+ * @all_fences: true if all fences should be returned
+ * @fence: the current fence
+ *
+ * Iterate over the fences in a struct dma_resv object while holding the
+ * &dma_resv.lock. @all_fences controls if the shared fences are returned as
+ * well. The cursor initialisation is part of the iterator and the fence stays
+ * valid as long as the lock is held and so no extra reference to the fence is
+ * taken.
+ */
+#define dma_resv_for_each_fence(cursor, obj, all_fences, fence)	\
+	for (dma_resv_iter_begin(cursor, obj, all_fences),	\
+	     fence = dma_resv_iter_first(cursor); fence;	\
+	     fence = dma_resv_iter_next(cursor))
+
 #define dma_resv_held(obj) lockdep_is_held(&(obj)->lock.base)
 #define dma_resv_assert_held(obj) lockdep_assert_held(&(obj)->lock.base)
 
-- 
cgit v1.2.3


From bdc7ca008e1f5539e891187032cb2cbbc3decb5e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de>
Date: Thu, 7 Oct 2021 14:14:13 +0200
Subject: spi: Remove unused function spi_busnum_to_master()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The last user is gone since commit

	2962db71c703 ("staging/fbtft: Remove fbtft_device")

in 2019.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Link: https://lore.kernel.org/r/20211007121415.2401638-3-u.kleine-koenig@pengutronix.de
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/spi.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 8371bca13729..f8e322a46616 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -760,8 +760,6 @@ extern int devm_spi_register_controller(struct device *dev,
 					struct spi_controller *ctlr);
 extern void spi_unregister_controller(struct spi_controller *ctlr);
 
-extern struct spi_controller *spi_busnum_to_master(u16 busnum);
-
 /*
  * SPI resource management while processing a SPI message
  */
-- 
cgit v1.2.3


From da21fde0fdb393c2fbe0ae0735cc826cd55fd46f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de>
Date: Thu, 7 Oct 2021 14:14:15 +0200
Subject: spi: Make several public functions private to spi.c
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

All these functions have no callers apart from drivers/spi/spi.c. So
drop their declarations in include/linux/spi/spi.h and don't export
them.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Link: https://lore.kernel.org/r/20211007121415.2401638-5-u.kleine-koenig@pengutronix.de
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/spi.h | 53 -------------------------------------------------
 1 file changed, 53 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index f8e322a46616..29e21d49aafc 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -78,10 +78,6 @@ struct spi_statistics {
 	unsigned long transfers_split_maxsize;
 };
 
-void spi_statistics_add_transfer_stats(struct spi_statistics *stats,
-				       struct spi_transfer *xfer,
-				       struct spi_controller *ctlr);
-
 #define SPI_STATISTICS_ADD_TO_FIELD(stats, field, count)	\
 	do {							\
 		unsigned long flags;				\
@@ -783,15 +779,6 @@ struct spi_res {
 	unsigned long long      data[]; /* guarantee ull alignment */
 };
 
-extern void *spi_res_alloc(struct spi_device *spi,
-			   spi_res_release_t release,
-			   size_t size, gfp_t gfp);
-extern void spi_res_add(struct spi_message *message, void *res);
-extern void spi_res_free(void *res);
-
-extern void spi_res_release(struct spi_controller *ctlr,
-			    struct spi_message *message);
-
 /*---------------------------------------------------------------------------*/
 
 /*
@@ -1109,8 +1096,6 @@ static inline void spi_message_free(struct spi_message *m)
 
 extern int spi_setup(struct spi_device *spi);
 extern int spi_async(struct spi_device *spi, struct spi_message *message);
-extern int spi_async_locked(struct spi_device *spi,
-			    struct spi_message *message);
 extern int spi_slave_abort(struct spi_device *spi);
 
 static inline size_t
@@ -1193,15 +1178,6 @@ struct spi_replaced_transfers {
 	struct spi_transfer inserted_transfers[];
 };
 
-extern struct spi_replaced_transfers *spi_replace_transfers(
-	struct spi_message *msg,
-	struct spi_transfer *xfer_first,
-	size_t remove,
-	size_t insert,
-	spi_replaced_release_t release,
-	size_t extradatasize,
-	gfp_t gfp);
-
 /*---------------------------------------------------------------------------*/
 
 /* SPI transfer transformation methods */
@@ -1473,19 +1449,7 @@ spi_register_board_info(struct spi_board_info const *info, unsigned n)
  * use spi_new_device() to describe each device.  You can also call
  * spi_unregister_device() to start making that device vanish, but
  * normally that would be handled by spi_unregister_controller().
- *
- * You can also use spi_alloc_device() and spi_add_device() to use a two
- * stage registration sequence for each spi_device.  This gives the caller
- * some more control over the spi_device structure before it is registered,
- * but requires that caller to initialize fields that would otherwise
- * be defined using the board info.
  */
-extern struct spi_device *
-spi_alloc_device(struct spi_controller *ctlr);
-
-extern int
-spi_add_device(struct spi_device *spi);
-
 extern struct spi_device *
 spi_new_device(struct spi_controller *, struct spi_board_info *);
 
@@ -1500,23 +1464,6 @@ spi_transfer_is_last(struct spi_controller *ctlr, struct spi_transfer *xfer)
 	return list_is_last(&xfer->transfer_list, &ctlr->cur_msg->transfers);
 }
 
-/* OF support code */
-#if IS_ENABLED(CONFIG_OF)
-
-/* must call put_device() when done with returned spi_device device */
-extern struct spi_device *
-of_find_spi_device_by_node(struct device_node *node);
-
-#else
-
-static inline struct spi_device *
-of_find_spi_device_by_node(struct device_node *node)
-{
-	return NULL;
-}
-
-#endif /* IS_ENABLED(CONFIG_OF) */
-
 /* Compatibility layer */
 #define spi_master			spi_controller
 
-- 
cgit v1.2.3


From ba882580f211dbe4fee7f010c9d38dd879db83a6 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 7 Oct 2021 11:18:46 -0700
Subject: eth: platform: add a helper for loading netdev->dev_addr

Commit 406f42fa0d3c ("net-next: When a bond have a massive amount
of VLANs...") introduced a rbtree for faster Ethernet address look
up. To maintain netdev->dev_addr in this tree we need to make all
the writes to it got through appropriate helpers.

There is a handful of drivers which pass netdev->dev_addr as
the destination buffer to eth_platform_get_mac_address().
Add a helper which takes a dev pointer instead, so it can call
an appropriate helper.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/etherdevice.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index e75116f48cd1..3cf546d2ffd1 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -29,6 +29,7 @@ struct device;
 struct fwnode_handle;
 
 int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr);
+int platform_get_ethdev_address(struct device *dev, struct net_device *netdev);
 unsigned char *arch_get_platform_mac_address(void);
 int nvmem_get_mac_address(struct device *dev, void *addrbuf);
 int device_get_mac_address(struct device *dev, char *addr);
-- 
cgit v1.2.3


From 75ea27d0d62281c31ee259c872dfdeb072cf5e39 Mon Sep 17 00:00:00 2001
From: Antoine Tenart <atenart@kernel.org>
Date: Thu, 7 Oct 2021 18:16:50 +0200
Subject: net: introduce a function to check if a netdev name is in use

__dev_get_by_name is currently used to either retrieve a net device
reference using its name or to check if a name is already used by a
registered net device (per ns). In the later case there is no need to
return a reference to a net device.

Introduce a new helper, netdev_name_in_use, to check if a name is
currently used by a registered net device without leaking a reference
the corresponding net device. This helper uses netdev_name_node_lookup
instead of __dev_get_by_name as we don't need the extra logic retrieving
a reference to the corresponding net device.

Signed-off-by: Antoine Tenart <atenart@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d79163208dfd..15f4a658e436 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2955,6 +2955,7 @@ struct net_device *__dev_get_by_flags(struct net *net, unsigned short flags,
 struct net_device *dev_get_by_name(struct net *net, const char *name);
 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name);
 struct net_device *__dev_get_by_name(struct net *net, const char *name);
+bool netdev_name_in_use(struct net *net, const char *name);
 int dev_alloc_name(struct net_device *dev, const char *name);
 int dev_open(struct net_device *dev, struct netlink_ext_ack *extack);
 void dev_close(struct net_device *dev);
-- 
cgit v1.2.3


From 0258b5fd7c7124b87e185a1a9322d2c66b1876b7 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 22 Sep 2021 11:24:02 -0500
Subject: coredump: Limit coredumps to a single thread group

Today when a signal is delivered with a handler of SIG_DFL whose
default behavior is to generate a core dump not only that process but
every process that shares the mm is killed.

In the case of vfork this looks like a real world problem.  Consider
the following well defined sequence.

	if (vfork() == 0) {
		execve(...);
		_exit(EXIT_FAILURE);
	}

If a signal that generates a core dump is received after vfork but
before the execve changes the mm the process that called vfork will
also be killed (as the mm is shared).

Similarly if the execve fails after the point of no return the kernel
delivers SIGSEGV which will kill both the exec'ing process and because
the mm is shared the process that called vfork as well.

As far as I can tell this behavior is a violation of people's
reasonable expectations, POSIX, and is unnecessarily fragile when the
system is low on memory.

Solve this by making a userspace visible change to only kill a single
process/thread group.  This is possible because Jann Horn recently
modified[1] the coredump code so that the mm can safely be modified
while the coredump is happening.  With LinuxThreads long gone I don't
expect anyone to have a notice this behavior change in practice.

To accomplish this move the core_state pointer from mm_struct to
signal_struct, which allows different thread groups to coredump
simultatenously.

In zap_threads remove the work to kill anything except for the current
thread group.

v2: Remove core_state from the VM_BUG_ON_MM print to fix
    compile failure when CONFIG_DEBUG_VM is enabled.
    Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>

[1] a07279c9a8cd ("binfmt_elf, binfmt_elf_fdpic: use a VMA list snapshot")
Fixes: d89f3847def4 ("[PATCH] thread-aware coredumps, 2.5.43-C3")
History-tree: git://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git
Link: https://lkml.kernel.org/r/87y27mvnke.fsf@disp2133
Link: https://lkml.kernel.org/r/20211007144701.67592574@canb.auug.org.au
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/mm_types.h     | 13 -------------
 include/linux/sched/signal.h | 13 +++++++++++++
 2 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7f8ee09c711f..1039f6ae922c 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -387,17 +387,6 @@ struct vm_area_struct {
 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
 } __randomize_layout;
 
-struct core_thread {
-	struct task_struct *task;
-	struct core_thread *next;
-};
-
-struct core_state {
-	atomic_t nr_threads;
-	struct core_thread dumper;
-	struct completion startup;
-};
-
 struct kioctx_table;
 struct mm_struct {
 	struct {
@@ -518,8 +507,6 @@ struct mm_struct {
 
 		unsigned long flags; /* Must use atomic bitops to access */
 
-		struct core_state *core_state; /* coredumping support */
-
 #ifdef CONFIG_AIO
 		spinlock_t			ioctx_lock;
 		struct kioctx_table __rcu	*ioctx_table;
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index e5f4ce622ee6..a8fe2a593a3a 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -72,6 +72,17 @@ struct multiprocess_signals {
 	struct hlist_node node;
 };
 
+struct core_thread {
+	struct task_struct *task;
+	struct core_thread *next;
+};
+
+struct core_state {
+	atomic_t nr_threads;
+	struct core_thread dumper;
+	struct completion startup;
+};
+
 /*
  * NOTE! "signal_struct" does not have its own
  * locking, because a shared signal_struct always
@@ -110,6 +121,8 @@ struct signal_struct {
 	int			group_stop_count;
 	unsigned int		flags; /* see SIGNAL_* flags below */
 
+	struct core_state *core_state; /* coredumping support */
+
 	/*
 	 * PR_SET_CHILD_SUBREAPER marks a process, like a service
 	 * manager, to re-parent orphan (double-forking) child processes
-- 
cgit v1.2.3


From 8208461d3912e3e97e31bcbd4ce716e4a251a5dd Mon Sep 17 00:00:00 2001
From: Aharon Landau <aharonl@nvidia.com>
Date: Fri, 8 Oct 2021 15:24:27 +0300
Subject: net/mlx5: Add ifc bits to support optional counters

Adding bth_opcode field and the relevant bits. This field will be used
to capture and count congestion notification packets (CNP).

Adding source_vhca_port support bit.
This field will be used to check the capability to use the
source_vhca_port as a match criteria in cases of dual port.

Signed-off-by: Aharon Landau <aharonl@nvidia.com>
Reviewed-by: Maor Gottlieb <maorg@nvidia.com>
Signed-off-by: Mark Zhang <markzhang@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 96f5fb2af811..34254dbe7117 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -342,7 +342,7 @@ struct mlx5_ifc_flow_table_fields_supported_bits {
 	u8         outer_geneve_oam[0x1];
 	u8         outer_geneve_protocol_type[0x1];
 	u8         outer_geneve_opt_len[0x1];
-	u8         reserved_at_1e[0x1];
+	u8         source_vhca_port[0x1];
 	u8         source_eswitch_port[0x1];
 
 	u8         inner_dmac[0x1];
@@ -393,6 +393,14 @@ struct mlx5_ifc_flow_table_fields_supported_bits {
 	u8         metadata_reg_c_0[0x1];
 };
 
+struct mlx5_ifc_flow_table_fields_supported_2_bits {
+	u8         reserved_at_0[0xe];
+	u8         bth_opcode[0x1];
+	u8         reserved_at_f[0x11];
+
+	u8         reserved_at_20[0x60];
+};
+
 struct mlx5_ifc_flow_table_prop_layout_bits {
 	u8         ft_support[0x1];
 	u8         reserved_at_1[0x1];
@@ -539,7 +547,7 @@ struct mlx5_ifc_fte_match_set_misc_bits {
 	union mlx5_ifc_gre_key_bits gre_key;
 
 	u8         vxlan_vni[0x18];
-	u8         reserved_at_b8[0x8];
+	u8         bth_opcode[0x8];
 
 	u8         geneve_vni[0x18];
 	u8         reserved_at_d8[0x7];
@@ -756,7 +764,15 @@ struct mlx5_ifc_flow_table_nic_cap_bits {
 
 	struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_transmit_sniffer;
 
-	u8         reserved_at_e00[0x1200];
+	u8         reserved_at_e00[0x700];
+
+	struct mlx5_ifc_flow_table_fields_supported_2_bits ft_field_support_2_nic_receive_rdma;
+
+	u8         reserved_at_1580[0x280];
+
+	struct mlx5_ifc_flow_table_fields_supported_2_bits ft_field_support_2_nic_transmit_rdma;
+
+	u8         reserved_at_1880[0x780];
 
 	u8         sw_steering_nic_rx_action_drop_icm_address[0x40];
 
-- 
cgit v1.2.3


From b8dfed636fc6239396c3a2ae5f812505906cf215 Mon Sep 17 00:00:00 2001
From: Aharon Landau <aharonl@nvidia.com>
Date: Fri, 8 Oct 2021 15:24:28 +0300
Subject: net/mlx5: Add priorities for counters in RDMA namespaces

Add additional flow steering priorities in the RDMA namespace.
This allows adding flow counters to count filtered RDMA traffic and then
continue processing in the regular RDMA steering flow.

Signed-off-by: Aharon Landau <aharonl@nvidia.com>
Reviewed-by: Maor Gottlieb <maorg@nvidia.com>
Signed-off-by: Mark Zhang <markzhang@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 include/linux/mlx5/device.h | 2 ++
 include/linux/mlx5/fs.h     | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 66eaf0aa7f69..ed0230ff9422 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1456,6 +1456,8 @@ static inline u16 mlx5_to_sw_pkey_sz(int pkey_sz)
 	return MLX5_MIN_PKEY_TABLE_SIZE << pkey_sz;
 }
 
+#define MLX5_RDMA_RX_NUM_COUNTERS_PRIOS 2
+#define MLX5_RDMA_TX_NUM_COUNTERS_PRIOS 1
 #define MLX5_BY_PASS_NUM_REGULAR_PRIOS 16
 #define MLX5_BY_PASS_NUM_DONT_TRAP_PRIOS 16
 #define MLX5_BY_PASS_NUM_MULTICAST_PRIOS 1
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 0106c67e8ccb..f2c3da2006d9 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -83,6 +83,8 @@ enum mlx5_flow_namespace_type {
 	MLX5_FLOW_NAMESPACE_RDMA_RX,
 	MLX5_FLOW_NAMESPACE_RDMA_RX_KERNEL,
 	MLX5_FLOW_NAMESPACE_RDMA_TX,
+	MLX5_FLOW_NAMESPACE_RDMA_RX_COUNTERS,
+	MLX5_FLOW_NAMESPACE_RDMA_TX_COUNTERS,
 };
 
 enum {
-- 
cgit v1.2.3


From 0392dd51f9c78d46109a408f27dc820300dcd8bd Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 4 Oct 2021 10:10:10 -0400
Subject: SUNRPC: Per-rpc_clnt task PIDs

The current range of RPC task PIDs is 0..65535. That's not adequate
for distinguishing tasks across multiple rpc_clnts running high
throughput workloads.

To help relieve this situation and to reduce the bottleneck of
having a single atomic for assigning all RPC task PIDs, assign task
PIDs per rpc_clnt.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/clnt.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index a4661646adc9..267b7aeaf1a6 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -40,6 +40,7 @@ struct rpc_clnt {
 	unsigned int		cl_clid;	/* client id */
 	struct list_head	cl_clients;	/* Global list of clients */
 	struct list_head	cl_tasks;	/* List of tasks */
+	atomic_t		cl_pid;		/* task PID counter */
 	spinlock_t		cl_lock;	/* spinlock */
 	struct rpc_xprt __rcu *	cl_xprt;	/* transport */
 	const struct rpc_procinfo *cl_procinfo;	/* procedure info */
-- 
cgit v1.2.3


From 0199215216978b612d4e8be11e878b87bc643033 Mon Sep 17 00:00:00 2001
From: Juhee Kang <claudiajkang@gmail.com>
Date: Sun, 10 Oct 2021 13:03:29 +0900
Subject: mlxsw: spectrum: use netif_is_macsec() instead of open code

Open code which is dev->priv_flags & IFF_MACSEC has already defined as
netif_is_macsec(). So use netif_is_macsec() instead of open code.
This patch doesn't change logic.

Signed-off-by: Juhee Kang <claudiajkang@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 15f4a658e436..0723c1314ea2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -5237,7 +5237,7 @@ static inline void netif_keep_dst(struct net_device *dev)
 static inline bool netif_reduces_vlan_mtu(struct net_device *dev)
 {
 	/* TODO: reserve and use an additional IFF bit, if we get more users */
-	return dev->priv_flags & IFF_MACSEC;
+	return netif_is_macsec(dev);
 }
 
 extern struct pernet_operations __net_initdata loopback_net_ops;
-- 
cgit v1.2.3


From bdac5c2b243f68ec15f8203c3348ae79fee8e8d8 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Thu, 16 Sep 2021 15:23:20 +0900
Subject: bootconfig: Allocate xbc_data inside xbc_init()

Allocate 'xbc_data' in the xbc_init() so that it does
not need to care about the ownership of the copied
data.

Link: https://lkml.kernel.org/r/163177339986.682366.898762699429769117.stgit@devnote2

Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/bootconfig.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h
index 537e1b991f11..62e09b788172 100644
--- a/include/linux/bootconfig.h
+++ b/include/linux/bootconfig.h
@@ -271,7 +271,7 @@ static inline int __init xbc_node_compose_key(struct xbc_node *node,
 }
 
 /* XBC node initializer */
-int __init xbc_init(char *buf, const char **emsg, int *epos);
+int __init xbc_init(const char *buf, size_t size, const char **emsg, int *epos);
 
 
 /* XBC cleanup data structures */
-- 
cgit v1.2.3


From e306220cb7b7c2948f191414ab06851e143b54c1 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Thu, 16 Sep 2021 15:23:29 +0900
Subject: bootconfig: Add xbc_get_info() for the node information

Add xbc_get_info() API which allows user to get the
number of used xbc_nodes and the size of bootconfig
data. This is also useful for checking the bootconfig
is initialized or not.

Link: https://lkml.kernel.org/r/163177340877.682366.4360676589783197627.stgit@devnote2

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/bootconfig.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h
index 62e09b788172..f955bb7eabbb 100644
--- a/include/linux/bootconfig.h
+++ b/include/linux/bootconfig.h
@@ -273,6 +273,8 @@ static inline int __init xbc_node_compose_key(struct xbc_node *node,
 /* XBC node initializer */
 int __init xbc_init(const char *buf, size_t size, const char **emsg, int *epos);
 
+/* XBC node and size information */
+int __init xbc_get_info(int *node_size, size_t *data_size);
 
 /* XBC cleanup data structures */
 void __init xbc_destroy_all(void);
-- 
cgit v1.2.3


From 115d4d08aeb942133d025a425dd611092893d774 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Fri, 17 Sep 2021 19:02:39 +0900
Subject: bootconfig: Rename xbc_destroy_all() to xbc_exit()

Avoid using this noisy name and use more calm one.
This is just a name change. No functional change.

Link: https://lkml.kernel.org/r/163187295918.2366983.5231840238429996027.stgit@devnote2

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/bootconfig.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h
index f955bb7eabbb..7eb7a7f8ade7 100644
--- a/include/linux/bootconfig.h
+++ b/include/linux/bootconfig.h
@@ -277,7 +277,7 @@ int __init xbc_init(const char *buf, size_t size, const char **emsg, int *epos);
 int __init xbc_get_info(int *node_size, size_t *data_size);
 
 /* XBC cleanup data structures */
-void __init xbc_destroy_all(void);
+void __init xbc_exit(void);
 
 /* Debug dump functions */
 void __init xbc_debug_dump(void);
-- 
cgit v1.2.3


From 9b81c9bfff4651abb28bfa6d83c8b879e467963b Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Fri, 17 Sep 2021 19:02:53 +0900
Subject: bootconfig: Remove unused debug function

Remove unused xbc_debug_dump() from bootconfig for clean up
the code.

Link: https://lkml.kernel.org/r/163187297371.2366983.12943349701785875450.stgit@devnote2

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/bootconfig.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h
index 7eb7a7f8ade7..85cdfd381877 100644
--- a/include/linux/bootconfig.h
+++ b/include/linux/bootconfig.h
@@ -279,7 +279,4 @@ int __init xbc_get_info(int *node_size, size_t *data_size);
 /* XBC cleanup data structures */
 void __init xbc_exit(void);
 
-/* Debug dump functions */
-void __init xbc_debug_dump(void);
-
 #endif
-- 
cgit v1.2.3


From 4f292c4886bfdfc2a7191ef4ff3f7aac69d1cc3f Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Fri, 17 Sep 2021 19:03:08 +0900
Subject: bootconfig: Replace u16 and u32 with uint16_t and uint32_t

Replace u16 and u32 with uint16_t and uint32_t so
that the tools/bootconfig only needs <stdint.h>.

Link: https://lkml.kernel.org/r/163187298835.2366983.9838262576854319669.stgit@devnote2

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/bootconfig.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h
index 85cdfd381877..a6f8dc51f168 100644
--- a/include/linux/bootconfig.h
+++ b/include/linux/bootconfig.h
@@ -25,10 +25,10 @@
  * The checksum will be used with the BOOTCONFIG_MAGIC and the size for
  * embedding the bootconfig in the initrd image.
  */
-static inline __init u32 xbc_calc_checksum(void *data, u32 size)
+static inline __init uint32_t xbc_calc_checksum(void *data, uint32_t size)
 {
 	unsigned char *p = data;
-	u32 ret = 0;
+	uint32_t ret = 0;
 
 	while (size--)
 		ret += *p++;
@@ -38,10 +38,10 @@ static inline __init u32 xbc_calc_checksum(void *data, u32 size)
 
 /* XBC tree node */
 struct xbc_node {
-	u16 next;
-	u16 child;
-	u16 parent;
-	u16 data;
+	uint16_t next;
+	uint16_t child;
+	uint16_t parent;
+	uint16_t data;
 } __attribute__ ((__packed__));
 
 #define XBC_KEY		0
-- 
cgit v1.2.3


From 4ee1b4cac236650979a5d9b745bb0a83efde3a46 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Fri, 17 Sep 2021 19:03:16 +0900
Subject: bootconfig: Cleanup dummy headers in tools/bootconfig

Cleanup dummy headers in tools/bootconfig/include except
for tools/bootconfig/include/linux/bootconfig.h.
For this change, I use __KERNEL__ macro to split kernel
header #include and introduce xbc_alloc_mem() and
xbc_free_mem().

Link: https://lkml.kernel.org/r/163187299574.2366983.18371329724128746091.stgit@devnote2

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/bootconfig.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h
index a6f8dc51f168..a4665c7ab07c 100644
--- a/include/linux/bootconfig.h
+++ b/include/linux/bootconfig.h
@@ -7,8 +7,18 @@
  * Author: Masami Hiramatsu <mhiramat@kernel.org>
  */
 
+#ifdef __KERNEL__
 #include <linux/kernel.h>
 #include <linux/types.h>
+#else /* !__KERNEL__ */
+/*
+ * NOTE: This is only for tools/bootconfig, because tools/bootconfig will
+ * run the parser sanity test.
+ * This does NOT mean linux/bootconfig.h is available in the user space.
+ * However, if you change this file, please make sure the tools/bootconfig
+ * has no issue on building and running.
+ */
+#endif
 
 #define BOOTCONFIG_MAGIC	"#BOOTCONFIG\n"
 #define BOOTCONFIG_MAGIC_LEN	12
-- 
cgit v1.2.3


From 5e51cc0005c6ed1b793c228632f36269615f7c31 Mon Sep 17 00:00:00 2001
From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Date: Fri, 8 Oct 2021 10:50:07 +0100
Subject: dma-resv: Fix dma_resv_get_fences and dma_resv_copy_fences after
 conversion
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Cache the count of shared fences in the iterator to avoid dereferencing
the dma_resv_object outside the RCU protection. Otherwise iterator and its
users can observe an incosistent state which makes it impossible to use
safely. Such as:

<6> [187.517041] [IGT] gem_sync: executing
<7> [187.536343] i915 0000:00:02.0: [drm:i915_gem_context_create_ioctl [i915]] HW context 1 created
<7> [187.536793] i915 0000:00:02.0: [drm:i915_gem_context_create_ioctl [i915]] HW context 1 created
<6> [187.551235] [IGT] gem_sync: starting subtest basic-many-each
<1> [188.935462] BUG: kernel NULL pointer dereference, address: 0000000000000010
<1> [188.935485] #PF: supervisor write access in kernel mode
<1> [188.935495] #PF: error_code(0x0002) - not-present page
<6> [188.935504] PGD 0 P4D 0
<4> [188.935512] Oops: 0002 [#1] PREEMPT SMP NOPTI
<4> [188.935521] CPU: 2 PID: 1467 Comm: gem_sync Not tainted 5.15.0-rc4-CI-Patchwork_21264+ #1
<4> [188.935535] Hardware name:  /NUC6CAYB, BIOS AYAPLCEL.86A.0049.2018.0508.1356 05/08/2018
<4> [188.935546] RIP: 0010:dma_resv_get_fences+0x116/0x2d0
<4> [188.935560] Code: 10 85 c0 7f c9 be 03 00 00 00 e8 15 8b df ff eb bd e8 8e c6 ff ff eb b6 41 8b 04 24 49 8b 55 00 48 89 e7 8d 48 01 41 89 0c 24 <4c> 89 34 c2 e8 41 f2 ff ff 49 89 c6 48 85 c0 75 8c 48 8b 44 24 10
<4> [188.935583] RSP: 0018:ffffc900011dbcc8 EFLAGS: 00010202
<4> [188.935593] RAX: 0000000000000000 RBX: 00000000ffffffff RCX: 0000000000000001
<4> [188.935603] RDX: 0000000000000010 RSI: ffffffff822e343c RDI: ffffc900011dbcc8
<4> [188.935613] RBP: ffffc900011dbd48 R08: ffff88812d255bb8 R09: 00000000fffffffe
<4> [188.935623] R10: 0000000000000001 R11: 0000000000000000 R12: ffffc900011dbd44
<4> [188.935633] R13: ffffc900011dbd50 R14: ffff888113d29cc0 R15: 0000000000000000
<4> [188.935643] FS:  00007f68d17e9700(0000) GS:ffff888277900000(0000) knlGS:0000000000000000
<4> [188.935655] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
<4> [188.935665] CR2: 0000000000000010 CR3: 000000012d0a4000 CR4: 00000000003506e0
<4> [188.935676] Call Trace:
<4> [188.935685]  i915_gem_object_wait+0x1ff/0x410 [i915]
<4> [188.935988]  i915_gem_wait_ioctl+0xf2/0x2a0 [i915]
<4> [188.936272]  ? i915_gem_object_wait+0x410/0x410 [i915]
<4> [188.936533]  drm_ioctl_kernel+0xae/0x140
<4> [188.936546]  drm_ioctl+0x201/0x3d0
<4> [188.936555]  ? i915_gem_object_wait+0x410/0x410 [i915]
<4> [188.936820]  ? __fget_files+0xc2/0x1c0
<4> [188.936830]  ? __fget_files+0xda/0x1c0
<4> [188.936839]  __x64_sys_ioctl+0x6d/0xa0
<4> [188.936848]  do_syscall_64+0x3a/0xb0
<4> [188.936859]  entry_SYSCALL_64_after_hwframe+0x44/0xae

If the shared object has changed during the RCU unlocked period
callers will correctly handle the restart on the next iteration.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Fixes: 96601e8a4755 ("dma-buf: use new iterator in dma_resv_copy_fences")
Fixes: d3c80698c9f5 ("dma-buf: use new iterator in dma_resv_get_fences v3")
Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/4274
Cc: Christian König <christian.koenig@amd.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Sumit Semwal <sumit.semwal@linaro.org>
Cc: linux-media@vger.kernel.org
Cc: dri-devel@lists.freedesktop.org
Cc: linaro-mm-sig@lists.linaro.org
Link: https://patchwork.freedesktop.org/patch/msgid/20211008095007.972693-1-tvrtko.ursulin@linux.intel.com
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Christian König <christian.koenig@amd.com>
---
 include/linux/dma-resv.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-resv.h b/include/linux/dma-resv.h
index 491359cea54c..45f1d4812a37 100644
--- a/include/linux/dma-resv.h
+++ b/include/linux/dma-resv.h
@@ -170,9 +170,12 @@ struct dma_resv_iter {
 	/** @index: index into the shared fences */
 	unsigned int index;
 
-	/** @fences: the shared fences */
+	/** @fences: the shared fences; private, *MUST* not dereference  */
 	struct dma_resv_list *fences;
 
+	/** @shared_count: number of shared fences */
+	unsigned int shared_count;
+
 	/** @is_restarted: true if this is the first returned fence */
 	bool is_restarted;
 };
-- 
cgit v1.2.3


From a440943e68cd1b5a853a6f60865967b7cc2539eb Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 15 Sep 2021 08:59:58 +0200
Subject: unicode: remove the charset field from struct unicode_map

It is hardcoded and only used for a f2fs sysfs file where it can be
hardcoded just as easily.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Gabriel Krisman Bertazi <krisman@collabora.com>
Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
---
 include/linux/unicode.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/unicode.h b/include/linux/unicode.h
index 74484d44c755..6a392cd9f076 100644
--- a/include/linux/unicode.h
+++ b/include/linux/unicode.h
@@ -6,7 +6,6 @@
 #include <linux/dcache.h>
 
 struct unicode_map {
-	const char *charset;
 	int version;
 };
 
-- 
cgit v1.2.3


From f3a9c82396006a5664f6e398d6928799d29de76e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 15 Sep 2021 08:59:59 +0200
Subject: unicode: mark the version field in struct unicode_map unsigned

unicode version tripplets are always unsigned.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Gabriel Krisman Bertazi <krisman@collabora.com>
Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
---
 include/linux/unicode.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/unicode.h b/include/linux/unicode.h
index 6a392cd9f076..0744f81c4b5f 100644
--- a/include/linux/unicode.h
+++ b/include/linux/unicode.h
@@ -6,7 +6,7 @@
 #include <linux/dcache.h>
 
 struct unicode_map {
-	int version;
+	unsigned int version;
 };
 
 int utf8_validate(const struct unicode_map *um, const struct qstr *str);
-- 
cgit v1.2.3


From 49bd03cc7e95cb78420305ca2f5ef67497b6fa80 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 15 Sep 2021 09:00:00 +0200
Subject: unicode: pass a UNICODE_AGE() tripple to utf8_load

Don't bother with pointless string parsing when the caller can just pass
the version in the format that the core expects.  Also remove the
fallback to the latest version that none of the callers actually uses.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
---
 include/linux/unicode.h | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/unicode.h b/include/linux/unicode.h
index 0744f81c4b5f..77bb915fd1f0 100644
--- a/include/linux/unicode.h
+++ b/include/linux/unicode.h
@@ -5,6 +5,29 @@
 #include <linux/init.h>
 #include <linux/dcache.h>
 
+#define UNICODE_MAJ_SHIFT		16
+#define UNICODE_MIN_SHIFT		8
+
+#define UNICODE_AGE(MAJ, MIN, REV)			\
+	(((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) |	\
+	 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |	\
+	 ((unsigned int)(REV)))
+
+static inline u8 unicode_major(unsigned int age)
+{
+	return (age >> UNICODE_MAJ_SHIFT) & 0xff;
+}
+
+static inline u8 unicode_minor(unsigned int age)
+{
+	return (age >> UNICODE_MIN_SHIFT) & 0xff;
+}
+
+static inline u8 unicode_rev(unsigned int age)
+{
+	return age & 0xff;
+}
+
 struct unicode_map {
 	unsigned int version;
 };
@@ -29,7 +52,7 @@ int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
 int utf8_casefold_hash(const struct unicode_map *um, const void *salt,
 		       struct qstr *str);
 
-struct unicode_map *utf8_load(const char *version);
+struct unicode_map *utf8_load(unsigned int version);
 void utf8_unload(struct unicode_map *um);
 
 #endif /* _LINUX_UNICODE_H */
-- 
cgit v1.2.3


From 6ca99ce756c27852d1ea1e555045de1c920f30ed Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 15 Sep 2021 09:00:04 +0200
Subject: unicode: cache the normalization tables in struct unicode_map

Instead of repeatedly looking up the version add pointers to the
NFD and NFD+CF tables to struct unicode_map, and pass a
unicode_map plus index to the functions using the normalization
tables.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
---
 include/linux/unicode.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/unicode.h b/include/linux/unicode.h
index 77bb915fd1f0..526ca8b8391a 100644
--- a/include/linux/unicode.h
+++ b/include/linux/unicode.h
@@ -5,6 +5,8 @@
 #include <linux/init.h>
 #include <linux/dcache.h>
 
+struct utf8data;
+
 #define UNICODE_MAJ_SHIFT		16
 #define UNICODE_MIN_SHIFT		8
 
@@ -28,8 +30,25 @@ static inline u8 unicode_rev(unsigned int age)
 	return age & 0xff;
 }
 
+/*
+ * Two normalization forms are supported:
+ * 1) NFDI
+ *   - Apply unicode normalization form NFD.
+ *   - Remove any Default_Ignorable_Code_Point.
+ * 2) NFDICF
+ *   - Apply unicode normalization form NFD.
+ *   - Remove any Default_Ignorable_Code_Point.
+ *   - Apply a full casefold (C + F).
+ */
+enum utf8_normalization {
+	UTF8_NFDI = 0,
+	UTF8_NFDICF,
+	UTF8_NMAX,
+};
+
 struct unicode_map {
 	unsigned int version;
+	const struct utf8data *ntab[UTF8_NMAX];
 };
 
 int utf8_validate(const struct unicode_map *um, const struct qstr *str);
-- 
cgit v1.2.3


From bf2928c7a284e31f9f91a004b5dca09c522157c0 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Fri, 10 Sep 2021 08:22:06 +0200
Subject: PCI/VPD: Add pci_read/write_vpd_any()

In certain cases we need a variant of pci_read_vpd()/pci_write_vpd() that
does not check against dev->vpd.len. Such cases are:

  - Reading VPD if dev->vpd.len isn't set yet (in pci_vpd_size())

  - Devices that map non-VPD information to arbitrary places in VPD address
    space (example: Chelsio T3 EEPROM write-protect flag)

Therefore add pci_read_vpd_any() and pci_write_vpd_any() that check against
PCI_VPD_MAX_SIZE only.

Link: https://lore.kernel.org/r/93ecce28-a158-f02a-d134-8afcaced8efe@gmail.com
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/pci.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index cd8aa6fce204..9649bd9e468d 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1350,6 +1350,8 @@ void pci_unlock_rescan_remove(void);
 /* Vital Product Data routines */
 ssize_t pci_read_vpd(struct pci_dev *dev, loff_t pos, size_t count, void *buf);
 ssize_t pci_write_vpd(struct pci_dev *dev, loff_t pos, size_t count, const void *buf);
+ssize_t pci_read_vpd_any(struct pci_dev *dev, loff_t pos, size_t count, void *buf);
+ssize_t pci_write_vpd_any(struct pci_dev *dev, loff_t pos, size_t count, const void *buf);
 
 /* Helper functions for low-level code (drivers/pci/setup-[bus,res].c) */
 resource_size_t pcibios_retrieve_fw_addr(struct pci_dev *dev, int idx);
-- 
cgit v1.2.3


From f614fb60a1983819e83796198de2b607fba75e99 Mon Sep 17 00:00:00 2001
From: Wenbin Mei <wenbin.mei@mediatek.com>
Date: Fri, 17 Sep 2021 20:48:02 +0800
Subject: mmc: core: Add host specific tuning support for eMMC HS400 mode

This adds a ->execute_hs400_tuning() host callback to enable optional
support for host specific tuning for eMMC HS400 mode. Additionally, share
mmc_get_ext_csd() through the public host headerfile, to allow it to be
used by the host drivers, which is needed to support the HS400 tuning.

Signed-off-by: Wenbin Mei <wenbin.mei@mediatek.com>
Link: https://lore.kernel.org/r/20210917124803.22871-3-wenbin.mei@mediatek.com
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 include/linux/mmc/host.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 0c0c9a0fdf57..b6c082e2664c 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -162,6 +162,9 @@ struct mmc_host_ops {
 	/* Prepare HS400 target operating frequency depending host driver */
 	int	(*prepare_hs400_tuning)(struct mmc_host *host, struct mmc_ios *ios);
 
+	/* Execute HS400 tuning depending host driver */
+	int	(*execute_hs400_tuning)(struct mmc_host *host, struct mmc_card *card);
+
 	/* Prepare switch to DDR during the HS400 init sequence */
 	int	(*hs400_prepare_ddr)(struct mmc_host *host);
 
@@ -634,5 +637,6 @@ static inline enum dma_data_direction mmc_get_dma_dir(struct mmc_data *data)
 
 int mmc_send_tuning(struct mmc_host *host, u32 opcode, int *cmd_error);
 int mmc_send_abort_tuning(struct mmc_host *host, u32 opcode);
+int mmc_get_ext_csd(struct mmc_card *card, u8 **new_ext_csd);
 
 #endif /* LINUX_MMC_HOST_H */
-- 
cgit v1.2.3


From 0ae93b99beb283438aa571a6add4eab0c077d576 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 7 Oct 2021 16:17:24 -0400
Subject: SUNRPC: Simplify the SVC dispatch code path

Micro-optimization: The last user of the generic SVC dispatch code
path has been removed, so svc_process_common() can be simplified.
This declutters the hot path so that the by-far most common case
(a dispatch function exists) is made the /only/ path.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/svc.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 6263410c948a..4205a6ef4770 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -443,10 +443,7 @@ struct svc_version {
 	/* Need xprt with congestion control */
 	bool			vs_need_cong_ctrl;
 
-	/* Override dispatch function (e.g. when caching replies).
-	 * A return value of 0 means drop the request. 
-	 * vs_dispatch == NULL means use default dispatcher.
-	 */
+	/* Dispatch function */
 	int			(*vs_dispatch)(struct svc_rqst *, __be32 *);
 };
 
-- 
cgit v1.2.3


From 2b3d047870120bcd46d7cc257d19ff49328fd585 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 15 Sep 2021 09:00:05 +0200
Subject: unicode: Add utf8-data module

utf8data.h contains a large database table which is an auto-generated
decodification trie for the unicode normalization functions.

Allow building it into a separate module.

Based on a patch from Shreeya Patel <shreeya.patel@collabora.com>.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
---
 include/linux/unicode.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/unicode.h b/include/linux/unicode.h
index 526ca8b8391a..4d39e6e11a95 100644
--- a/include/linux/unicode.h
+++ b/include/linux/unicode.h
@@ -6,6 +6,7 @@
 #include <linux/dcache.h>
 
 struct utf8data;
+struct utf8data_table;
 
 #define UNICODE_MAJ_SHIFT		16
 #define UNICODE_MIN_SHIFT		8
@@ -49,6 +50,7 @@ enum utf8_normalization {
 struct unicode_map {
 	unsigned int version;
 	const struct utf8data *ntab[UTF8_NMAX];
+	const struct utf8data_table *tables;
 };
 
 int utf8_validate(const struct unicode_map *um, const struct qstr *str);
-- 
cgit v1.2.3


From 8e9028b3790ddb02c407e1a4da0ab7cf82790e7e Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Tue, 12 Oct 2021 15:42:59 -0500
Subject: PCI: Return NULL for to_pci_driver(NULL)

to_pci_driver() takes a pointer to a struct device_driver and uses
container_of() to find the struct pci_driver that contains it.

If given a NULL pointer to a struct device_driver, return a NULL pci_driver
pointer instead of applying container_of() to NULL.

This simplifies callers that would otherwise have to check for a NULL
pointer first.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 include/linux/pci.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index cd8aa6fce204..a2d62165a7f7 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -900,7 +900,10 @@ struct pci_driver {
 	struct pci_dynids	dynids;
 };
 
-#define	to_pci_driver(drv) container_of(drv, struct pci_driver, driver)
+static inline struct pci_driver *to_pci_driver(struct device_driver *drv)
+{
+    return drv ? container_of(drv, struct pci_driver, driver) : NULL;
+}
 
 /**
  * PCI_DEVICE - macro used to describe a specific PCI device
-- 
cgit v1.2.3


From b3ec8cdf457e5e63d396fe1346cc788cf7c1b578 Mon Sep 17 00:00:00 2001
From: Claudio Suarez <cssk@net-c.es>
Date: Thu, 30 Sep 2021 17:10:26 +0200
Subject: fbdev: Garbage collect fbdev scrolling acceleration, part 1 (from
 TODO list)

Scroll acceleration is disabled in fbcon by hard-wiring
p->scrollmode = SCROLL_REDRAW. Remove the obsolete code in fbcon.c
and fbdev/core/

Signed-off-by: Claudio Suarez <cssk@net-c.es>
Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Link: https://patchwork.freedesktop.org/patch/msgid/YVXTYqszZix9TxjJ@gineta.localdomain
---
 include/linux/fb.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fb.h b/include/linux/fb.h
index 5950f8f5dc74..6f3db99ab990 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -262,7 +262,7 @@ struct fb_ops {
 
 	/* Draws a rectangle */
 	void (*fb_fillrect) (struct fb_info *info, const struct fb_fillrect *rect);
-	/* Copy data from area to another */
+	/* Copy data from area to another. Obsolete. */
 	void (*fb_copyarea) (struct fb_info *info, const struct fb_copyarea *region);
 	/* Draws a image to the display */
 	void (*fb_imageblit) (struct fb_info *info, const struct fb_image *image);
-- 
cgit v1.2.3


From 5008062f1c3f5af3acf86164aa6fcc77b0c7bdce Mon Sep 17 00:00:00 2001
From: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Date: Wed, 13 Oct 2021 14:19:56 +0100
Subject: nvmem: core: add nvmem cell post processing callback

Some NVMEM providers have certain nvmem cells encoded, which requires
post processing before actually using it.

For example mac-address is stored in either in ascii or delimited or reverse-order.

Having a post-process callback hook to provider drivers would enable them to
do this vendor specific post processing before nvmem consumers see it.

Tested-by: Joakim Zhang <qiangqing.zhang@nxp.com>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/r/20211013131957.30271-3-srinivas.kandagatla@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/nvmem-provider.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h
index 104505e9028f..98efb7b5660d 100644
--- a/include/linux/nvmem-provider.h
+++ b/include/linux/nvmem-provider.h
@@ -19,6 +19,9 @@ typedef int (*nvmem_reg_read_t)(void *priv, unsigned int offset,
 				void *val, size_t bytes);
 typedef int (*nvmem_reg_write_t)(void *priv, unsigned int offset,
 				 void *val, size_t bytes);
+/* used for vendor specific post processing of cell data */
+typedef int (*nvmem_cell_post_process_t)(void *priv, const char *id, unsigned int offset,
+					  void *buf, size_t bytes);
 
 enum nvmem_type {
 	NVMEM_TYPE_UNKNOWN = 0,
@@ -62,6 +65,7 @@ struct nvmem_keepout {
  * @no_of_node:	Device should not use the parent's of_node even if it's !NULL.
  * @reg_read:	Callback to read data.
  * @reg_write:	Callback to write data.
+ * @cell_post_process:	Callback for vendor specific post processing of cell data
  * @size:	Device size.
  * @word_size:	Minimum read/write access granularity.
  * @stride:	Minimum read/write access stride.
@@ -92,6 +96,7 @@ struct nvmem_config {
 	bool			no_of_node;
 	nvmem_reg_read_t	reg_read;
 	nvmem_reg_write_t	reg_write;
+	nvmem_cell_post_process_t cell_post_process;
 	int	size;
 	int	word_size;
 	int	stride;
-- 
cgit v1.2.3


From 16c663642c7ec03cd4cee5fec520bb69e97babe4 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 12 Oct 2021 11:57:22 -0400
Subject: SUNRPC: Replace the "__be32 *p" parameter to .pc_decode

The passed-in value of the "__be32 *p" parameter is now unused in
every server-side XDR decoder, and can be removed.

Note also that there is a line in each decoder that sets up a local
pointer to a struct xdr_stream. Passing that pointer from the
dispatcher instead saves one line per decoder function.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/lockd/xdr.h  | 19 ++++++++++---------
 include/linux/lockd/xdr4.h | 19 +++++++++----------
 include/linux/sunrpc/svc.h |  3 ++-
 3 files changed, 21 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lockd/xdr.h b/include/linux/lockd/xdr.h
index a98309c0121c..170ad6f5596a 100644
--- a/include/linux/lockd/xdr.h
+++ b/include/linux/lockd/xdr.h
@@ -96,18 +96,19 @@ struct nlm_reboot {
  */
 #define NLMSVC_XDRSIZE		sizeof(struct nlm_args)
 
-int	nlmsvc_decode_testargs(struct svc_rqst *, __be32 *);
+int	nlmsvc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+int	nlmsvc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+int	nlmsvc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+int	nlmsvc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+int	nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+int	nlmsvc_decode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+int	nlmsvc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+int	nlmsvc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+int	nlmsvc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+
 int	nlmsvc_encode_testres(struct svc_rqst *, __be32 *);
-int	nlmsvc_decode_lockargs(struct svc_rqst *, __be32 *);
-int	nlmsvc_decode_cancargs(struct svc_rqst *, __be32 *);
-int	nlmsvc_decode_unlockargs(struct svc_rqst *, __be32 *);
 int	nlmsvc_encode_res(struct svc_rqst *, __be32 *);
-int	nlmsvc_decode_res(struct svc_rqst *, __be32 *);
 int	nlmsvc_encode_void(struct svc_rqst *, __be32 *);
-int	nlmsvc_decode_void(struct svc_rqst *, __be32 *);
-int	nlmsvc_decode_shareargs(struct svc_rqst *, __be32 *);
 int	nlmsvc_encode_shareres(struct svc_rqst *, __be32 *);
-int	nlmsvc_decode_notify(struct svc_rqst *, __be32 *);
-int	nlmsvc_decode_reboot(struct svc_rqst *, __be32 *);
 
 #endif /* LOCKD_XDR_H */
diff --git a/include/linux/lockd/xdr4.h b/include/linux/lockd/xdr4.h
index 5ae766f26e04..68e14e0f2b1f 100644
--- a/include/linux/lockd/xdr4.h
+++ b/include/linux/lockd/xdr4.h
@@ -22,21 +22,20 @@
 #define	nlm4_fbig		cpu_to_be32(NLM_FBIG)
 #define	nlm4_failed		cpu_to_be32(NLM_FAILED)
 
+int	nlm4svc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+int	nlm4svc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+int	nlm4svc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+int	nlm4svc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+int	nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+int	nlm4svc_decode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+int	nlm4svc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+int	nlm4svc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+int	nlm4svc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr);
 
-
-int	nlm4svc_decode_testargs(struct svc_rqst *, __be32 *);
 int	nlm4svc_encode_testres(struct svc_rqst *, __be32 *);
-int	nlm4svc_decode_lockargs(struct svc_rqst *, __be32 *);
-int	nlm4svc_decode_cancargs(struct svc_rqst *, __be32 *);
-int	nlm4svc_decode_unlockargs(struct svc_rqst *, __be32 *);
 int	nlm4svc_encode_res(struct svc_rqst *, __be32 *);
-int	nlm4svc_decode_res(struct svc_rqst *, __be32 *);
 int	nlm4svc_encode_void(struct svc_rqst *, __be32 *);
-int	nlm4svc_decode_void(struct svc_rqst *, __be32 *);
-int	nlm4svc_decode_shareargs(struct svc_rqst *, __be32 *);
 int	nlm4svc_encode_shareres(struct svc_rqst *, __be32 *);
-int	nlm4svc_decode_notify(struct svc_rqst *, __be32 *);
-int	nlm4svc_decode_reboot(struct svc_rqst *, __be32 *);
 
 extern const struct rpc_version nlm_version4;
 
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 4205a6ef4770..da3c5bc43d85 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -454,7 +454,8 @@ struct svc_procedure {
 	/* process the request: */
 	__be32			(*pc_func)(struct svc_rqst *);
 	/* XDR decode args: */
-	int			(*pc_decode)(struct svc_rqst *, __be32 *data);
+	int			(*pc_decode)(struct svc_rqst *rqstp,
+					     struct xdr_stream *xdr);
 	/* XDR encode result: */
 	int			(*pc_encode)(struct svc_rqst *, __be32 *data);
 	/* XDR free result: */
-- 
cgit v1.2.3


From c44b31c263798ec34614dd394c31ef1a2e7e716e Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 12 Oct 2021 11:57:28 -0400
Subject: SUNRPC: Change return value type of .pc_decode

Returning an undecorated integer is an age-old trope, but it's
not clear (even to previous experts in this code) that the only
valid return values are 1 and 0. These functions do not return
a negative errno, rpc_stat value, or a positive length.

Document there are only two valid return values by having
.pc_decode return only true or false.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/lockd/xdr.h  | 18 +++++++++---------
 include/linux/lockd/xdr4.h | 18 +++++++++---------
 include/linux/sunrpc/svc.h |  2 +-
 3 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lockd/xdr.h b/include/linux/lockd/xdr.h
index 170ad6f5596a..e1362244f909 100644
--- a/include/linux/lockd/xdr.h
+++ b/include/linux/lockd/xdr.h
@@ -96,15 +96,15 @@ struct nlm_reboot {
  */
 #define NLMSVC_XDRSIZE		sizeof(struct nlm_args)
 
-int	nlmsvc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr);
-int	nlmsvc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
-int	nlmsvc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
-int	nlmsvc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
-int	nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
-int	nlmsvc_decode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr);
-int	nlmsvc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr);
-int	nlmsvc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
-int	nlmsvc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool	nlmsvc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool	nlmsvc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool	nlmsvc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool	nlmsvc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool	nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool	nlmsvc_decode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool	nlmsvc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool	nlmsvc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool	nlmsvc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr);
 
 int	nlmsvc_encode_testres(struct svc_rqst *, __be32 *);
 int	nlmsvc_encode_res(struct svc_rqst *, __be32 *);
diff --git a/include/linux/lockd/xdr4.h b/include/linux/lockd/xdr4.h
index 68e14e0f2b1f..376b8f6a3763 100644
--- a/include/linux/lockd/xdr4.h
+++ b/include/linux/lockd/xdr4.h
@@ -22,15 +22,15 @@
 #define	nlm4_fbig		cpu_to_be32(NLM_FBIG)
 #define	nlm4_failed		cpu_to_be32(NLM_FAILED)
 
-int	nlm4svc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr);
-int	nlm4svc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
-int	nlm4svc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
-int	nlm4svc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
-int	nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
-int	nlm4svc_decode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr);
-int	nlm4svc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr);
-int	nlm4svc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
-int	nlm4svc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool	nlm4svc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool	nlm4svc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool	nlm4svc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool	nlm4svc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool	nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool	nlm4svc_decode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool	nlm4svc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool	nlm4svc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool	nlm4svc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr);
 
 int	nlm4svc_encode_testres(struct svc_rqst *, __be32 *);
 int	nlm4svc_encode_res(struct svc_rqst *, __be32 *);
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index da3c5bc43d85..d6109fa7a57b 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -454,7 +454,7 @@ struct svc_procedure {
 	/* process the request: */
 	__be32			(*pc_func)(struct svc_rqst *);
 	/* XDR decode args: */
-	int			(*pc_decode)(struct svc_rqst *rqstp,
+	bool			(*pc_decode)(struct svc_rqst *rqstp,
 					     struct xdr_stream *xdr);
 	/* XDR encode result: */
 	int			(*pc_encode)(struct svc_rqst *, __be32 *data);
-- 
cgit v1.2.3


From fda494411485aff91768842c532f90fb8eb54943 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 13 Oct 2021 10:41:06 -0400
Subject: SUNRPC: Replace the "__be32 *p" parameter to .pc_encode

The passed-in value of the "__be32 *p" parameter is now unused in
every server-side XDR encoder, and can be removed.

Note also that there is a line in each encoder that sets up a local
pointer to a struct xdr_stream. Passing that pointer from the
dispatcher instead saves one line per encoder function.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/lockd/xdr.h  | 8 ++++----
 include/linux/lockd/xdr4.h | 8 ++++----
 include/linux/sunrpc/svc.h | 3 ++-
 3 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lockd/xdr.h b/include/linux/lockd/xdr.h
index e1362244f909..d8bd26a5525e 100644
--- a/include/linux/lockd/xdr.h
+++ b/include/linux/lockd/xdr.h
@@ -106,9 +106,9 @@ bool	nlmsvc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr);
 bool	nlmsvc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
 bool	nlmsvc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr);
 
-int	nlmsvc_encode_testres(struct svc_rqst *, __be32 *);
-int	nlmsvc_encode_res(struct svc_rqst *, __be32 *);
-int	nlmsvc_encode_void(struct svc_rqst *, __be32 *);
-int	nlmsvc_encode_shareres(struct svc_rqst *, __be32 *);
+int	nlmsvc_encode_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+int	nlmsvc_encode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+int	nlmsvc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+int	nlmsvc_encode_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
 
 #endif /* LOCKD_XDR_H */
diff --git a/include/linux/lockd/xdr4.h b/include/linux/lockd/xdr4.h
index 376b8f6a3763..50677be3557d 100644
--- a/include/linux/lockd/xdr4.h
+++ b/include/linux/lockd/xdr4.h
@@ -32,10 +32,10 @@ bool	nlm4svc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr);
 bool	nlm4svc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
 bool	nlm4svc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr);
 
-int	nlm4svc_encode_testres(struct svc_rqst *, __be32 *);
-int	nlm4svc_encode_res(struct svc_rqst *, __be32 *);
-int	nlm4svc_encode_void(struct svc_rqst *, __be32 *);
-int	nlm4svc_encode_shareres(struct svc_rqst *, __be32 *);
+int	nlm4svc_encode_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+int	nlm4svc_encode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+int	nlm4svc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+int	nlm4svc_encode_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
 
 extern const struct rpc_version nlm_version4;
 
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index d6109fa7a57b..85694cd0db66 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -457,7 +457,8 @@ struct svc_procedure {
 	bool			(*pc_decode)(struct svc_rqst *rqstp,
 					     struct xdr_stream *xdr);
 	/* XDR encode result: */
-	int			(*pc_encode)(struct svc_rqst *, __be32 *data);
+	int			(*pc_encode)(struct svc_rqst *rqstp,
+					     struct xdr_stream *xdr);
 	/* XDR free result: */
 	void			(*pc_release)(struct svc_rqst *);
 	unsigned int		pc_argsize;	/* argument struct size */
-- 
cgit v1.2.3


From 130e2054d4a652a2bd79fb1557ddcd19c053cb37 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 13 Oct 2021 10:41:13 -0400
Subject: SUNRPC: Change return value type of .pc_encode

Returning an undecorated integer is an age-old trope, but it's
not clear (even to previous experts in this code) that the only
valid return values are 1 and 0. These functions do not return
a negative errno, rpc_stat value, or a positive length.

Document there are only two valid return values by having
.pc_encode return only true or false.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/lockd/xdr.h  | 8 ++++----
 include/linux/lockd/xdr4.h | 8 ++++----
 include/linux/sunrpc/svc.h | 2 +-
 3 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lockd/xdr.h b/include/linux/lockd/xdr.h
index d8bd26a5525e..398f70093cd3 100644
--- a/include/linux/lockd/xdr.h
+++ b/include/linux/lockd/xdr.h
@@ -106,9 +106,9 @@ bool	nlmsvc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr);
 bool	nlmsvc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
 bool	nlmsvc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr);
 
-int	nlmsvc_encode_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
-int	nlmsvc_encode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr);
-int	nlmsvc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr);
-int	nlmsvc_encode_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool	nlmsvc_encode_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool	nlmsvc_encode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool	nlmsvc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool	nlmsvc_encode_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
 
 #endif /* LOCKD_XDR_H */
diff --git a/include/linux/lockd/xdr4.h b/include/linux/lockd/xdr4.h
index 50677be3557d..9a6b55da8fd6 100644
--- a/include/linux/lockd/xdr4.h
+++ b/include/linux/lockd/xdr4.h
@@ -32,10 +32,10 @@ bool	nlm4svc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr);
 bool	nlm4svc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
 bool	nlm4svc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr);
 
-int	nlm4svc_encode_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
-int	nlm4svc_encode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr);
-int	nlm4svc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr);
-int	nlm4svc_encode_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool	nlm4svc_encode_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool	nlm4svc_encode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool	nlm4svc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool	nlm4svc_encode_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
 
 extern const struct rpc_version nlm_version4;
 
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 85694cd0db66..0ae28ae6caf2 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -457,7 +457,7 @@ struct svc_procedure {
 	bool			(*pc_decode)(struct svc_rqst *rqstp,
 					     struct xdr_stream *xdr);
 	/* XDR encode result: */
-	int			(*pc_encode)(struct svc_rqst *rqstp,
+	bool			(*pc_encode)(struct svc_rqst *rqstp,
 					     struct xdr_stream *xdr);
 	/* XDR free result: */
 	void			(*pc_release)(struct svc_rqst *);
-- 
cgit v1.2.3


From 40af35fdf79c77e19c597e47cc8fe9a8e200de30 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 12 Oct 2021 09:06:32 -0700
Subject: netdevice: demote the type of some dev_addr_set() helpers

__dev_addr_set() and dev_addr_mod() and pretty low level,
let the arguments be void, there's no chance for confusion
in callers converted to use them. Keep u8 in dev_addr_set()
because some of the callers are converted from a loop
and we want to make sure assignments are not from an array
of a different type.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0723c1314ea2..f33af341bfb2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4643,7 +4643,7 @@ void __hw_addr_init(struct netdev_hw_addr_list *list);
 
 /* Functions used for device addresses handling */
 static inline void
-__dev_addr_set(struct net_device *dev, const u8 *addr, size_t len)
+__dev_addr_set(struct net_device *dev, const void *addr, size_t len)
 {
 	memcpy(dev->dev_addr, addr, len);
 }
@@ -4655,7 +4655,7 @@ static inline void dev_addr_set(struct net_device *dev, const u8 *addr)
 
 static inline void
 dev_addr_mod(struct net_device *dev, unsigned int offset,
-	     const u8 *addr, size_t len)
+	     const void *addr, size_t len)
 {
 	memcpy(&dev->dev_addr[offset], addr, len);
 }
-- 
cgit v1.2.3


From 06de2cd788bfa3b51137e9bd471d68029ab68103 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de>
Date: Tue, 12 Oct 2021 17:39:27 +0200
Subject: gpio: max730x: Make __max730x_remove() return void
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

An spi or i2c remove callback is only called for devices that probed
successfully. In this case this implies that __max730x_probe() set a
non-NULL driver data. So the check ts == NULL is never true. With this
check dropped, __max730x_remove() returns zero unconditionally. Make it
return void instead which makes it easier to see in the callers that
there is no error to handle.

Also the return value of i2c and spi remove callbacks is ignored anyway.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Bartosz Golaszewski <brgl@bgdev.pl>
---
 include/linux/spi/max7301.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/spi/max7301.h b/include/linux/spi/max7301.h
index 21449067aedb..e392c53758bc 100644
--- a/include/linux/spi/max7301.h
+++ b/include/linux/spi/max7301.h
@@ -31,6 +31,6 @@ struct max7301_platform_data {
 	u32		input_pullup_active;
 };
 
-extern int __max730x_remove(struct device *dev);
+extern void __max730x_remove(struct device *dev);
 extern int __max730x_probe(struct max7301 *ts);
 #endif
-- 
cgit v1.2.3


From 6312d52838b21f5c4a5afa1269a00df4364fd354 Mon Sep 17 00:00:00 2001
From: Anders Roxell <anders.roxell@linaro.org>
Date: Wed, 13 Oct 2021 15:57:43 +0200
Subject: marvell: octeontx2: build error: unknown type name 'u64'

Building an allmodconfig kernel arm64 kernel, the following build error
shows up:

In file included from drivers/crypto/marvell/octeontx2/cn10k_cpt.c:4:
include/linux/soc/marvell/octeontx2/asm.h:38:15: error: unknown type name 'u64'
   38 | static inline u64 otx2_atomic64_fetch_add(u64 incr, u64 *ptr)
      |               ^~~

Include linux/types.h in asm.h so the compiler knows what the type
'u64' are.

Fixes: af3826db74d1 ("octeontx2-pf: Use hardware register for CQE count")
Signed-off-by: Anders Roxell <anders.roxell@linaro.org>
Link: https://lore.kernel.org/r/20211013135743.3826594-1-anders.roxell@linaro.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/soc/marvell/octeontx2/asm.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/soc/marvell/octeontx2/asm.h b/include/linux/soc/marvell/octeontx2/asm.h
index 0f79fd7f81a1..d683251a0b40 100644
--- a/include/linux/soc/marvell/octeontx2/asm.h
+++ b/include/linux/soc/marvell/octeontx2/asm.h
@@ -5,6 +5,7 @@
 #ifndef __SOC_OTX2_ASM_H
 #define __SOC_OTX2_ASM_H
 
+#include <linux/types.h>
 #if defined(CONFIG_ARM64)
 /*
  * otx2_lmt_flush is used for LMT store operation.
-- 
cgit v1.2.3


From 78b727d0281507fabf954573420790b21e34aefe Mon Sep 17 00:00:00 2001
From: Shawn Guo <shawn.guo@linaro.org>
Date: Fri, 17 Sep 2021 11:04:34 +0800
Subject: clk: qcom: smd-rpm: Add QCM2290 RPM clock support

Add support for RPM-managed clocks on the QCM2290 platform.

Signed-off-by: Shawn Guo <shawn.guo@linaro.org>
Link: https://lore.kernel.org/r/20210917030434.19859-4-shawn.guo@linaro.org
Reviewed-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/linux/soc/qcom/smd-rpm.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/soc/qcom/smd-rpm.h b/include/linux/soc/qcom/smd-rpm.h
index 60e66fc9b6bf..860dd8cdf9f3 100644
--- a/include/linux/soc/qcom/smd-rpm.h
+++ b/include/linux/soc/qcom/smd-rpm.h
@@ -38,6 +38,8 @@ struct qcom_smd_rpm;
 #define QCOM_SMD_RPM_IPA_CLK	0x617069
 #define QCOM_SMD_RPM_CE_CLK	0x6563
 #define QCOM_SMD_RPM_AGGR_CLK	0x72676761
+#define QCOM_SMD_RPM_HWKM_CLK	0x6d6b7768
+#define QCOM_SMD_RPM_PKA_CLK	0x616b70
 
 int qcom_rpm_smd_write(struct qcom_smd_rpm *rpm,
 		       int state,
-- 
cgit v1.2.3


From 9974cb5c879048f5144d0660c4932d98176213c4 Mon Sep 17 00:00:00 2001
From: Chen Wandun <chenwandun@huawei.com>
Date: Wed, 13 Oct 2021 17:47:02 +0800
Subject: net: delete redundant function declaration

The implement of function netdev_all_upper_get_next_dev_rcu has been
removed in:
	commit f1170fd462c6 ("net: Remove all_adj_list and its references")
so delete redundant declaration in header file.

Fixes: f1170fd462c6 ("net: Remove all_adj_list and its references")
Signed-off-by: Chen Wandun <chenwandun@huawei.com>
Link: https://lore.kernel.org/r/20211013094702.3931071-1-chenwandun@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f33af341bfb2..173984414f38 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4801,8 +4801,6 @@ struct netdev_nested_priv {
 bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev);
 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
 						     struct list_head **iter);
-struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
-						     struct list_head **iter);
 
 #ifdef CONFIG_LOCKDEP
 static LIST_HEAD(net_unlink_list);
-- 
cgit v1.2.3


From f0ada6da3a0d69682e21f1783d02676e0fbf1bc1 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 13 Oct 2021 17:37:07 +0300
Subject: device property: Add missed header in fwnode.h

When adding some stuff to the header file we must not rely on
implicit dependencies that are happen by luck or bugs in other
headers. Hence fwnode.h needs to use bits.h directly.

Fixes: c2c724c868c4 ("driver core: Add fw_devlink_parse_fwtree()")
Cc: Saravana Kannan <saravanak@google.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20211013143707.80222-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/fwnode.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h
index 9f4ad719bfe3..3a532ba66f6c 100644
--- a/include/linux/fwnode.h
+++ b/include/linux/fwnode.h
@@ -11,6 +11,7 @@
 
 #include <linux/types.h>
 #include <linux/list.h>
+#include <linux/bits.h>
 #include <linux/err.h>
 
 struct fwnode_operations;
-- 
cgit v1.2.3


From 5de62ea84abd732ded7c5569426fd71c0420f83e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 21 Sep 2021 22:16:02 +0200
Subject: sched,livepatch: Use wake_up_if_idle()

Make sure to prod idle CPUs so they call klp_update_patch_state().

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Acked-by: Miroslav Benes <mbenes@suse.cz>
Acked-by: Vasily Gorbik <gor@linux.ibm.com>
Tested-by: Petr Mladek <pmladek@suse.com>
Tested-by: Vasily Gorbik <gor@linux.ibm.com> # on s390
Link: https://lkml.kernel.org/r/20210929151723.162004989@infradead.org
---
 include/linux/sched/idle.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h
index 22873d276be6..d73d314d59c6 100644
--- a/include/linux/sched/idle.h
+++ b/include/linux/sched/idle.h
@@ -11,7 +11,11 @@ enum cpu_idle_type {
 	CPU_MAX_IDLE_TYPES
 };
 
+#ifdef CONFIG_SMP
 extern void wake_up_if_idle(int cpu);
+#else
+static inline void wake_up_if_idle(int cpu) { }
+#endif
 
 /*
  * Idle thread specific functions to determine the need_resched
-- 
cgit v1.2.3


From 4ef0c5c6b5ba1f38f0ea1cedad0cad722f00c14a Mon Sep 17 00:00:00 2001
From: Zhang Qiao <zhangqiao22@huawei.com>
Date: Wed, 15 Sep 2021 14:40:30 +0800
Subject: kernel/sched: Fix sched_fork() access an invalid sched_task_group

There is a small race between copy_process() and sched_fork()
where child->sched_task_group point to an already freed pointer.

	parent doing fork()      | someone moving the parent
				 | to another cgroup
  -------------------------------+-------------------------------
  copy_process()
      + dup_task_struct()<1>
				  parent move to another cgroup,
				  and free the old cgroup. <2>
      + sched_fork()
	+ __set_task_cpu()<3>
	+ task_fork_fair()
	  + sched_slice()<4>

In the worst case, this bug can lead to "use-after-free" and
cause panic as shown above:

  (1) parent copy its sched_task_group to child at <1>;

  (2) someone move the parent to another cgroup and free the old
      cgroup at <2>;

  (3) the sched_task_group and cfs_rq that belong to the old cgroup
      will be accessed at <3> and <4>, which cause a panic:

  [] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
  [] PGD 8000001fa0a86067 P4D 8000001fa0a86067 PUD 2029955067 PMD 0
  [] Oops: 0000 [#1] SMP PTI
  [] CPU: 7 PID: 648398 Comm: ebizzy Kdump: loaded Tainted: G           OE    --------- -  - 4.18.0.x86_64+ #1
  [] RIP: 0010:sched_slice+0x84/0xc0

  [] Call Trace:
  []  task_fork_fair+0x81/0x120
  []  sched_fork+0x132/0x240
  []  copy_process.part.5+0x675/0x20e0
  []  ? __handle_mm_fault+0x63f/0x690
  []  _do_fork+0xcd/0x3b0
  []  do_syscall_64+0x5d/0x1d0
  []  entry_SYSCALL_64_after_hwframe+0x65/0xca
  [] RIP: 0033:0x7f04418cd7e1

Between cgroup_can_fork() and cgroup_post_fork(), the cgroup
membership and thus sched_task_group can't change. So update child's
sched_task_group at sched_post_fork() and move task_fork() and
__set_task_cpu() (where accees the sched_task_group) from sched_fork()
to sched_post_fork().

Fixes: 8323f26ce342 ("sched: Fix race in task_group")
Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Tejun Heo <tj@kernel.org>
Link: https://lkml.kernel.org/r/20210915064030.2231-1-zhangqiao22@huawei.com
---
 include/linux/sched/task.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index ef02be869cf2..ba88a6987400 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -54,7 +54,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev);
 extern void init_idle(struct task_struct *idle, int cpu);
 
 extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
-extern void sched_post_fork(struct task_struct *p);
+extern void sched_post_fork(struct task_struct *p,
+			    struct kernel_clone_args *kargs);
 extern void sched_dead(struct task_struct *p);
 
 void __noreturn do_task_dead(void);
-- 
cgit v1.2.3


From 804bccba71a57e7e5deb507a4c8ebbab730909c0 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 23 Sep 2021 19:54:50 -0700
Subject: sched: Fill unconditional hole induced by sched_entity

With struct sched_entity before the other sched entities, its alignment
won't induce a struct hole. This saves 64 bytes in defconfig task_struct:

Before:
	...
        unsigned int               rt_priority;          /*   120     4 */

        /* XXX 4 bytes hole, try to pack */

        /* --- cacheline 2 boundary (128 bytes) --- */
        const struct sched_class  * sched_class;         /*   128     8 */

        /* XXX 56 bytes hole, try to pack */

        /* --- cacheline 3 boundary (192 bytes) --- */
        struct sched_entity        se __attribute__((__aligned__(64))); /*   192   448 */
        /* --- cacheline 10 boundary (640 bytes) --- */
        struct sched_rt_entity     rt;                   /*   640    48 */
        struct sched_dl_entity     dl __attribute__((__aligned__(8))); /*   688   224 */
        /* --- cacheline 14 boundary (896 bytes) was 16 bytes ago --- */

After:
	...
        unsigned int               rt_priority;          /*   120     4 */

        /* XXX 4 bytes hole, try to pack */

        /* --- cacheline 2 boundary (128 bytes) --- */
        struct sched_entity        se __attribute__((__aligned__(64))); /*   128   448 */
        /* --- cacheline 9 boundary (576 bytes) --- */
        struct sched_rt_entity     rt;                   /*   576    48 */
        struct sched_dl_entity     dl __attribute__((__aligned__(8))); /*   624   224 */
        /* --- cacheline 13 boundary (832 bytes) was 16 bytes ago --- */

Summary diff:
-	/* size: 7040, cachelines: 110, members: 188 */
+	/* size: 6976, cachelines: 109, members: 188 */

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20210924025450.4138503-1-keescook@chromium.org
---
 include/linux/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 193e16e2d0e4..343603f77f8b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -775,10 +775,10 @@ struct task_struct {
 	int				normal_prio;
 	unsigned int			rt_priority;
 
-	const struct sched_class	*sched_class;
 	struct sched_entity		se;
 	struct sched_rt_entity		rt;
 	struct sched_dl_entity		dl;
+	const struct sched_class	*sched_class;
 
 #ifdef CONFIG_SCHED_CORE
 	struct rb_node			core_node;
-- 
cgit v1.2.3


From e9bdcdbf6936dd1fbf419e00222dc9038b7812c2 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Mon, 11 Oct 2021 15:32:44 +0200
Subject: pid: add pidfd_get_task() helper

The number of system calls making use of pidfds is constantly
increasing. Some of those new system calls duplicate the code to turn a
pidfd into task_struct it refers to. Give them a simple helper for this.

Link: https://lore.kernel.org/r/20211004125050.1153693-2-christian.brauner@ubuntu.com
Link: https://lore.kernel.org/r/20211011133245.1703103-2-brauner@kernel.org
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Matthew Bobrowski <repnop@google.com>
Cc: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Minchan Kim <minchan@kernel.org>
Reviewed-by: Matthew Bobrowski <repnop@google.com>
Acked-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 include/linux/pid.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/pid.h b/include/linux/pid.h
index af308e15f174..343abf22092e 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -78,6 +78,7 @@ struct file;
 
 extern struct pid *pidfd_pid(const struct file *file);
 struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags);
+struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags);
 int pidfd_create(struct pid *pid, unsigned int flags);
 
 static inline struct pid *get_pid(struct pid *pid)
-- 
cgit v1.2.3


From 9f37ab0412eba537377c38b1dde1a04fbd7b5264 Mon Sep 17 00:00:00 2001
From: Logan Gunthorpe <logang@deltatee.com>
Date: Thu, 14 Oct 2021 14:18:59 +0000
Subject: PCI/switchtec: Add check of event support

Not all events are supported by every gen/variant of the Switchtec
firmware. To solve this, since Gen4, a new bit in each event header
is introduced to indicate if an event is supported by the firmware.

Link: https://lore.kernel.org/r/20211014141859.11444-6-kelvin.cao@microchip.com
Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
Signed-off-by: Kelvin Cao <kelvin.cao@microchip.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 include/linux/switchtec.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/switchtec.h b/include/linux/switchtec.h
index 082f1d51957a..be24056ac00f 100644
--- a/include/linux/switchtec.h
+++ b/include/linux/switchtec.h
@@ -19,6 +19,7 @@
 #define SWITCHTEC_EVENT_EN_CLI   BIT(2)
 #define SWITCHTEC_EVENT_EN_IRQ   BIT(3)
 #define SWITCHTEC_EVENT_FATAL    BIT(4)
+#define SWITCHTEC_EVENT_NOT_SUPP BIT(31)
 
 #define SWITCHTEC_DMA_MRPC_EN	BIT(0)
 
-- 
cgit v1.2.3


From 766607570becbd26cab6d66a544dd8d0d964df5a Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 14 Oct 2021 07:24:31 -0700
Subject: ethernet: constify references to netdev->dev_addr in drivers

This big patch sprinkles const on local variables and
function arguments which may refer to netdev->dev_addr.

Commit 406f42fa0d3c ("net-next: When a bond have a massive amount
of VLANs...") introduced a rbtree for faster Ethernet address look
up. To maintain netdev->dev_addr in this tree we need to make all
the writes to it got through appropriate helpers.

Some of the changes here are not strictly required - const
is sometimes cast off but pointer is not used for writing.
It seems like it's still better to add the const in case
the code changes later or relevant -W flags get enabled
for the build.

No functional changes.

Link: https://lore.kernel.org/r/20211014142432.449314-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/qed/qed_eth_if.h  | 2 +-
 include/linux/qed/qed_if.h      | 2 +-
 include/linux/qed/qed_rdma_if.h | 3 ++-
 3 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h
index 4df0bf0a0864..e1bf3219b4e6 100644
--- a/include/linux/qed/qed_eth_if.h
+++ b/include/linux/qed/qed_eth_if.h
@@ -331,7 +331,7 @@ struct qed_eth_ops {
 	int (*configure_arfs_searcher)(struct qed_dev *cdev,
 				       enum qed_filter_config_mode mode);
 	int (*get_coalesce)(struct qed_dev *cdev, u16 *coal, void *handle);
-	int (*req_bulletin_update_mac)(struct qed_dev *cdev, u8 *mac);
+	int (*req_bulletin_update_mac)(struct qed_dev *cdev, const u8 *mac);
 };
 
 const struct qed_eth_ops *qed_get_eth_ops(void);
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index ad220d5da18f..0dae7fcc5ef2 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -1113,7 +1113,7 @@ struct qed_common_ops {
  *
  * Return: Int.
  */
-	int (*update_mac)(struct qed_dev *cdev, u8 *mac);
+	int (*update_mac)(struct qed_dev *cdev, const u8 *mac);
 
 /**
  * update_mtu(): API to inform the change in the mtu.
diff --git a/include/linux/qed/qed_rdma_if.h b/include/linux/qed/qed_rdma_if.h
index aeb242cefebf..3b76c07fbcf8 100644
--- a/include/linux/qed/qed_rdma_if.h
+++ b/include/linux/qed/qed_rdma_if.h
@@ -662,7 +662,8 @@ struct qed_rdma_ops {
 			     u8 connection_handle,
 			     struct qed_ll2_stats *p_stats);
 	int (*ll2_set_mac_filter)(struct qed_dev *cdev,
-				  u8 *old_mac_address, u8 *new_mac_address);
+				  u8 *old_mac_address,
+				  const u8 *new_mac_address);
 
 	int (*iwarp_set_engine_affin)(struct qed_dev *cdev, bool b_reset);
 
-- 
cgit v1.2.3


From 54f2d8d6ca99a3e24efc9c8b055bbcbe0b583227 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 13 Oct 2021 13:44:30 -0700
Subject: ethernet: make eth_hw_addr_random() use dev_addr_set()

Commit 406f42fa0d3c ("net-next: When a bond have a massive amount
of VLANs...") introduced a rbtree for faster Ethernet address look
up. To maintain netdev->dev_addr in this tree we need to make all
the writes to it got through appropriate helpers.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/etherdevice.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 3cf546d2ffd1..76f7ff684cbf 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -269,8 +269,11 @@ static inline void eth_zero_addr(u8 *addr)
  */
 static inline void eth_hw_addr_random(struct net_device *dev)
 {
+	u8 addr[ETH_ALEN];
+
+	eth_random_addr(addr);
+	__dev_addr_set(dev, addr, ETH_ALEN);
 	dev->addr_assign_type = NET_ADDR_RANDOM;
-	eth_random_addr(dev->dev_addr);
 }
 
 /**
-- 
cgit v1.2.3


From ba530fea8ca1b57ee71d4e62f287a5d7ed92f789 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 13 Oct 2021 13:54:50 -0700
Subject: ethernet: remove random_ether_addr()

random_ether_addr() was the original name of the helper which
was kept for backward compatibility (?) after the rename in
commit 0a4dd594982a ("etherdevice: Rename random_ether_addr
to eth_random_addr").

We have a single random_ether_addr() caller left in tree
while there are 70 callers of eth_random_addr().
Time to drop this define.

Reviewed-by: Simon Horman <simon.horman@corigine.com>
Link: https://lore.kernel.org/r/20211013205450.328092-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/etherdevice.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 76f7ff684cbf..23681c3d3b8a 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -234,8 +234,6 @@ static inline void eth_random_addr(u8 *addr)
 	addr[0] |= 0x02;	/* set local assignment bit (IEEE802) */
 }
 
-#define random_ether_addr(addr) eth_random_addr(addr)
-
 /**
  * eth_broadcast_addr - Assign broadcast address
  * @addr: Pointer to a six-byte array containing the Ethernet address
-- 
cgit v1.2.3


From 7463acfbe52ae8b7e0ea6890c1886b3f8ba8bddd Mon Sep 17 00:00:00 2001
From: Lukas Wunner <lukas@wunner.de>
Date: Fri, 8 Oct 2021 22:06:01 +0200
Subject: netfilter: Rename ingress hook include file

Prepare for addition of a netfilter egress hook by renaming
<linux/netfilter_ingress.h> to <linux/netfilter_netdev.h>.

The egress hook also necessitates a refactoring of the include file,
but that is done in a separate commit to ease reviewing.

No functional change intended.

Signed-off-by: Lukas Wunner <lukas@wunner.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_ingress.h | 58 ---------------------------------------
 include/linux/netfilter_netdev.h  | 58 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+), 58 deletions(-)
 delete mode 100644 include/linux/netfilter_ingress.h
 create mode 100644 include/linux/netfilter_netdev.h

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ingress.h b/include/linux/netfilter_ingress.h
deleted file mode 100644
index a13774be2eb5..000000000000
--- a/include/linux/netfilter_ingress.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _NETFILTER_INGRESS_H_
-#define _NETFILTER_INGRESS_H_
-
-#include <linux/netfilter.h>
-#include <linux/netdevice.h>
-
-#ifdef CONFIG_NETFILTER_INGRESS
-static inline bool nf_hook_ingress_active(const struct sk_buff *skb)
-{
-#ifdef CONFIG_JUMP_LABEL
-	if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_INGRESS]))
-		return false;
-#endif
-	return rcu_access_pointer(skb->dev->nf_hooks_ingress);
-}
-
-/* caller must hold rcu_read_lock */
-static inline int nf_hook_ingress(struct sk_buff *skb)
-{
-	struct nf_hook_entries *e = rcu_dereference(skb->dev->nf_hooks_ingress);
-	struct nf_hook_state state;
-	int ret;
-
-	/* Must recheck the ingress hook head, in the event it became NULL
-	 * after the check in nf_hook_ingress_active evaluated to true.
-	 */
-	if (unlikely(!e))
-		return 0;
-
-	nf_hook_state_init(&state, NF_NETDEV_INGRESS,
-			   NFPROTO_NETDEV, skb->dev, NULL, NULL,
-			   dev_net(skb->dev), NULL);
-	ret = nf_hook_slow(skb, &state, e, 0);
-	if (ret == 0)
-		return -1;
-
-	return ret;
-}
-
-static inline void nf_hook_ingress_init(struct net_device *dev)
-{
-	RCU_INIT_POINTER(dev->nf_hooks_ingress, NULL);
-}
-#else /* CONFIG_NETFILTER_INGRESS */
-static inline int nf_hook_ingress_active(struct sk_buff *skb)
-{
-	return 0;
-}
-
-static inline int nf_hook_ingress(struct sk_buff *skb)
-{
-	return 0;
-}
-
-static inline void nf_hook_ingress_init(struct net_device *dev) {}
-#endif /* CONFIG_NETFILTER_INGRESS */
-#endif /* _NETFILTER_INGRESS_H_ */
diff --git a/include/linux/netfilter_netdev.h b/include/linux/netfilter_netdev.h
new file mode 100644
index 000000000000..a13774be2eb5
--- /dev/null
+++ b/include/linux/netfilter_netdev.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _NETFILTER_INGRESS_H_
+#define _NETFILTER_INGRESS_H_
+
+#include <linux/netfilter.h>
+#include <linux/netdevice.h>
+
+#ifdef CONFIG_NETFILTER_INGRESS
+static inline bool nf_hook_ingress_active(const struct sk_buff *skb)
+{
+#ifdef CONFIG_JUMP_LABEL
+	if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_INGRESS]))
+		return false;
+#endif
+	return rcu_access_pointer(skb->dev->nf_hooks_ingress);
+}
+
+/* caller must hold rcu_read_lock */
+static inline int nf_hook_ingress(struct sk_buff *skb)
+{
+	struct nf_hook_entries *e = rcu_dereference(skb->dev->nf_hooks_ingress);
+	struct nf_hook_state state;
+	int ret;
+
+	/* Must recheck the ingress hook head, in the event it became NULL
+	 * after the check in nf_hook_ingress_active evaluated to true.
+	 */
+	if (unlikely(!e))
+		return 0;
+
+	nf_hook_state_init(&state, NF_NETDEV_INGRESS,
+			   NFPROTO_NETDEV, skb->dev, NULL, NULL,
+			   dev_net(skb->dev), NULL);
+	ret = nf_hook_slow(skb, &state, e, 0);
+	if (ret == 0)
+		return -1;
+
+	return ret;
+}
+
+static inline void nf_hook_ingress_init(struct net_device *dev)
+{
+	RCU_INIT_POINTER(dev->nf_hooks_ingress, NULL);
+}
+#else /* CONFIG_NETFILTER_INGRESS */
+static inline int nf_hook_ingress_active(struct sk_buff *skb)
+{
+	return 0;
+}
+
+static inline int nf_hook_ingress(struct sk_buff *skb)
+{
+	return 0;
+}
+
+static inline void nf_hook_ingress_init(struct net_device *dev) {}
+#endif /* CONFIG_NETFILTER_INGRESS */
+#endif /* _NETFILTER_INGRESS_H_ */
-- 
cgit v1.2.3


From 17d20784223d52bf1671f984c9e8d5d9b8ea171b Mon Sep 17 00:00:00 2001
From: Lukas Wunner <lukas@wunner.de>
Date: Fri, 8 Oct 2021 22:06:02 +0200
Subject: netfilter: Generalize ingress hook include file

Prepare for addition of a netfilter egress hook by generalizing the
ingress hook include file.

No functional change intended.

Signed-off-by: Lukas Wunner <lukas@wunner.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_netdev.h | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_netdev.h b/include/linux/netfilter_netdev.h
index a13774be2eb5..5812b0fb0278 100644
--- a/include/linux/netfilter_netdev.h
+++ b/include/linux/netfilter_netdev.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _NETFILTER_INGRESS_H_
-#define _NETFILTER_INGRESS_H_
+#ifndef _NETFILTER_NETDEV_H_
+#define _NETFILTER_NETDEV_H_
 
 #include <linux/netfilter.h>
 #include <linux/netdevice.h>
@@ -38,10 +38,6 @@ static inline int nf_hook_ingress(struct sk_buff *skb)
 	return ret;
 }
 
-static inline void nf_hook_ingress_init(struct net_device *dev)
-{
-	RCU_INIT_POINTER(dev->nf_hooks_ingress, NULL);
-}
 #else /* CONFIG_NETFILTER_INGRESS */
 static inline int nf_hook_ingress_active(struct sk_buff *skb)
 {
@@ -52,7 +48,13 @@ static inline int nf_hook_ingress(struct sk_buff *skb)
 {
 	return 0;
 }
-
-static inline void nf_hook_ingress_init(struct net_device *dev) {}
 #endif /* CONFIG_NETFILTER_INGRESS */
-#endif /* _NETFILTER_INGRESS_H_ */
+
+static inline void nf_hook_netdev_init(struct net_device *dev)
+{
+#ifdef CONFIG_NETFILTER_INGRESS
+	RCU_INIT_POINTER(dev->nf_hooks_ingress, NULL);
+#endif
+}
+
+#endif /* _NETFILTER_NETDEV_H_ */
-- 
cgit v1.2.3


From 42df6e1d221dddc0f2acf2be37e68d553ad65f96 Mon Sep 17 00:00:00 2001
From: Lukas Wunner <lukas@wunner.de>
Date: Fri, 8 Oct 2021 22:06:03 +0200
Subject: netfilter: Introduce egress hook
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Support classifying packets with netfilter on egress to satisfy user
requirements such as:
* outbound security policies for containers (Laura)
* filtering and mangling intra-node Direct Server Return (DSR) traffic
  on a load balancer (Laura)
* filtering locally generated traffic coming in through AF_PACKET,
  such as local ARP traffic generated for clustering purposes or DHCP
  (Laura; the AF_PACKET plumbing is contained in a follow-up commit)
* L2 filtering from ingress and egress for AVB (Audio Video Bridging)
  and gPTP with nftables (Pablo)
* in the future: in-kernel NAT64/NAT46 (Pablo)

The egress hook introduced herein complements the ingress hook added by
commit e687ad60af09 ("netfilter: add netfilter ingress hook after
handle_ing() under unique static key").  A patch for nftables to hook up
egress rules from user space has been submitted separately, so users may
immediately take advantage of the feature.

Alternatively or in addition to netfilter, packets can be classified
with traffic control (tc).  On ingress, packets are classified first by
tc, then by netfilter.  On egress, the order is reversed for symmetry.
Conceptually, tc and netfilter can be thought of as layers, with
netfilter layered above tc.

Traffic control is capable of redirecting packets to another interface
(man 8 tc-mirred).  E.g., an ingress packet may be redirected from the
host namespace to a container via a veth connection:
tc ingress (host) -> tc egress (veth host) -> tc ingress (veth container)

In this case, netfilter egress classifying is not performed when leaving
the host namespace!  That's because the packet is still on the tc layer.
If tc redirects the packet to a physical interface in the host namespace
such that it leaves the system, the packet is never subjected to
netfilter egress classifying.  That is only logical since it hasn't
passed through netfilter ingress classifying either.

Packets can alternatively be redirected at the netfilter layer using
nft fwd.  Such a packet *is* subjected to netfilter egress classifying
since it has reached the netfilter layer.

Internally, the skb->nf_skip_egress flag controls whether netfilter is
invoked on egress by __dev_queue_xmit().  Because __dev_queue_xmit() may
be called recursively by tunnel drivers such as vxlan, the flag is
reverted to false after sch_handle_egress().  This ensures that
netfilter is applied both on the overlay and underlying network.

Interaction between tc and netfilter is possible by setting and querying
skb->mark.

If netfilter egress classifying is not enabled on any interface, it is
patched out of the data path by way of a static_key and doesn't make a
performance difference that is discernible from noise:

Before:             1537 1538 1538 1537 1538 1537 Mb/sec
After:              1536 1534 1539 1539 1539 1540 Mb/sec
Before + tc accept: 1418 1418 1418 1419 1419 1418 Mb/sec
After  + tc accept: 1419 1424 1418 1419 1422 1420 Mb/sec
Before + tc drop:   1620 1619 1619 1619 1620 1620 Mb/sec
After  + tc drop:   1616 1624 1625 1624 1622 1619 Mb/sec

When netfilter egress classifying is enabled on at least one interface,
a minimal performance penalty is incurred for every egress packet, even
if the interface it's transmitted over doesn't have any netfilter egress
rules configured.  That is caused by checking dev->nf_hooks_egress
against NULL.

Measurements were performed on a Core i7-3615QM.  Commands to reproduce:
ip link add dev foo type dummy
ip link set dev foo up
modprobe pktgen
echo "add_device foo" > /proc/net/pktgen/kpktgend_3
samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh -i foo -n 400000000 -m "11:11:11:11:11:11" -d 1.1.1.1

Accept all traffic with tc:
tc qdisc add dev foo clsact
tc filter add dev foo egress bpf da bytecode '1,6 0 0 0,'

Drop all traffic with tc:
tc qdisc add dev foo clsact
tc filter add dev foo egress bpf da bytecode '1,6 0 0 2,'

Apply this patch when measuring packet drops to avoid errors in dmesg:
https://lore.kernel.org/netdev/a73dda33-57f4-95d8-ea51-ed483abd6a7a@iogearbox.net/

Signed-off-by: Lukas Wunner <lukas@wunner.de>
Cc: Laura García Liébana <nevola@gmail.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Thomas Graf <tgraf@suug.ch>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netdevice.h        |  4 ++
 include/linux/netfilter_netdev.h | 86 ++++++++++++++++++++++++++++++++++++++++
 include/linux/skbuff.h           |  4 ++
 3 files changed, 94 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d79163208dfd..e9a48068f306 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1861,6 +1861,7 @@ enum netdev_ml_priv_type {
  *	@xps_maps:	XXX: need comments on this one
  *	@miniq_egress:		clsact qdisc specific data for
  *				egress processing
+ *	@nf_hooks_egress:	netfilter hooks executed for egress packets
  *	@qdisc_hash:		qdisc hash table
  *	@watchdog_timeo:	Represents the timeout that is used by
  *				the watchdog (see dev_watchdog())
@@ -2161,6 +2162,9 @@ struct net_device {
 #ifdef CONFIG_NET_CLS_ACT
 	struct mini_Qdisc __rcu	*miniq_egress;
 #endif
+#ifdef CONFIG_NETFILTER_EGRESS
+	struct nf_hook_entries __rcu *nf_hooks_egress;
+#endif
 
 #ifdef CONFIG_NET_SCHED
 	DECLARE_HASHTABLE	(qdisc_hash, 4);
diff --git a/include/linux/netfilter_netdev.h b/include/linux/netfilter_netdev.h
index 5812b0fb0278..b71b57a83bb4 100644
--- a/include/linux/netfilter_netdev.h
+++ b/include/linux/netfilter_netdev.h
@@ -50,11 +50,97 @@ static inline int nf_hook_ingress(struct sk_buff *skb)
 }
 #endif /* CONFIG_NETFILTER_INGRESS */
 
+#ifdef CONFIG_NETFILTER_EGRESS
+static inline bool nf_hook_egress_active(void)
+{
+#ifdef CONFIG_JUMP_LABEL
+	if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_EGRESS]))
+		return false;
+#endif
+	return true;
+}
+
+/**
+ * nf_hook_egress - classify packets before transmission
+ * @skb: packet to be classified
+ * @rc: result code which shall be returned by __dev_queue_xmit() on failure
+ * @dev: netdev whose egress hooks shall be applied to @skb
+ *
+ * Returns @skb on success or %NULL if the packet was consumed or filtered.
+ * Caller must hold rcu_read_lock.
+ *
+ * On ingress, packets are classified first by tc, then by netfilter.
+ * On egress, the order is reversed for symmetry.  Conceptually, tc and
+ * netfilter can be thought of as layers, with netfilter layered above tc:
+ * When tc redirects a packet to another interface, netfilter is not applied
+ * because the packet is on the tc layer.
+ *
+ * The nf_skip_egress flag controls whether netfilter is applied on egress.
+ * It is updated by __netif_receive_skb_core() and __dev_queue_xmit() when the
+ * packet passes through tc and netfilter.  Because __dev_queue_xmit() may be
+ * called recursively by tunnel drivers such as vxlan, the flag is reverted to
+ * false after sch_handle_egress().  This ensures that netfilter is applied
+ * both on the overlay and underlying network.
+ */
+static inline struct sk_buff *nf_hook_egress(struct sk_buff *skb, int *rc,
+					     struct net_device *dev)
+{
+	struct nf_hook_entries *e;
+	struct nf_hook_state state;
+	int ret;
+
+#ifdef CONFIG_NETFILTER_SKIP_EGRESS
+	if (skb->nf_skip_egress)
+		return skb;
+#endif
+
+	e = rcu_dereference(dev->nf_hooks_egress);
+	if (!e)
+		return skb;
+
+	nf_hook_state_init(&state, NF_NETDEV_EGRESS,
+			   NFPROTO_NETDEV, dev, NULL, NULL,
+			   dev_net(dev), NULL);
+	ret = nf_hook_slow(skb, &state, e, 0);
+
+	if (ret == 1) {
+		return skb;
+	} else if (ret < 0) {
+		*rc = NET_XMIT_DROP;
+		return NULL;
+	} else { /* ret == 0 */
+		*rc = NET_XMIT_SUCCESS;
+		return NULL;
+	}
+}
+#else /* CONFIG_NETFILTER_EGRESS */
+static inline bool nf_hook_egress_active(void)
+{
+	return false;
+}
+
+static inline struct sk_buff *nf_hook_egress(struct sk_buff *skb, int *rc,
+					     struct net_device *dev)
+{
+	return skb;
+}
+#endif /* CONFIG_NETFILTER_EGRESS */
+
+static inline void nf_skip_egress(struct sk_buff *skb, bool skip)
+{
+#ifdef CONFIG_NETFILTER_SKIP_EGRESS
+	skb->nf_skip_egress = skip;
+#endif
+}
+
 static inline void nf_hook_netdev_init(struct net_device *dev)
 {
 #ifdef CONFIG_NETFILTER_INGRESS
 	RCU_INIT_POINTER(dev->nf_hooks_ingress, NULL);
 #endif
+#ifdef CONFIG_NETFILTER_EGRESS
+	RCU_INIT_POINTER(dev->nf_hooks_egress, NULL);
+#endif
 }
 
 #endif /* _NETFILTER_NETDEV_H_ */
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 841e2f0f5240..cb96f1e6460c 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -652,6 +652,7 @@ typedef unsigned char *sk_buff_data_t;
  *	@tc_at_ingress: used within tc_classify to distinguish in/egress
  *	@redirected: packet was redirected by packet classifier
  *	@from_ingress: packet was redirected from the ingress path
+ *	@nf_skip_egress: packet shall skip nf egress - see netfilter_netdev.h
  *	@peeked: this packet has been seen already, so stats have been
  *		done for it, don't do them again
  *	@nf_trace: netfilter packet trace flag
@@ -868,6 +869,9 @@ struct sk_buff {
 #ifdef CONFIG_NET_REDIRECT
 	__u8			from_ingress:1;
 #endif
+#ifdef CONFIG_NETFILTER_SKIP_EGRESS
+	__u8			nf_skip_egress:1;
+#endif
 #ifdef CONFIG_TLS_DEVICE
 	__u8			decrypted:1;
 #endif
-- 
cgit v1.2.3


From 8844e01062ddd8196c4550df9803cc1835d123c2 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 11 Oct 2021 17:15:10 +0200
Subject: netfilter: iptables: allow use of ipt_do_table as hookfn

This is possible now that the xt_table structure is passed in via *priv.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_ipv4/ip_tables.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h
index 8d09bfe850dc..132b0e4a6d4d 100644
--- a/include/linux/netfilter_ipv4/ip_tables.h
+++ b/include/linux/netfilter_ipv4/ip_tables.h
@@ -63,9 +63,9 @@ struct ipt_error {
 }
 
 extern void *ipt_alloc_initial_table(const struct xt_table *);
-extern unsigned int ipt_do_table(struct sk_buff *skb,
-				 const struct nf_hook_state *state,
-				 struct xt_table *table);
+extern unsigned int ipt_do_table(void *priv,
+				 struct sk_buff *skb,
+				 const struct nf_hook_state *state);
 
 #ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 #include <net/compat.h>
-- 
cgit v1.2.3


From e8d225b6002673366abc2e40e30c991bdc8d62ca Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 11 Oct 2021 17:15:12 +0200
Subject: netfilter: arp_tables: allow use of arpt_do_table as hookfn

This is possible now that the xt_table structure is passed in via *priv.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_arp/arp_tables.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_arp/arp_tables.h b/include/linux/netfilter_arp/arp_tables.h
index 4f9a4b3c5892..a40aaf645fa4 100644
--- a/include/linux/netfilter_arp/arp_tables.h
+++ b/include/linux/netfilter_arp/arp_tables.h
@@ -54,9 +54,8 @@ int arpt_register_table(struct net *net, const struct xt_table *table,
 			const struct nf_hook_ops *ops);
 void arpt_unregister_table(struct net *net, const char *name);
 void arpt_unregister_table_pre_exit(struct net *net, const char *name);
-extern unsigned int arpt_do_table(struct sk_buff *skb,
-				  const struct nf_hook_state *state,
-				  struct xt_table *table);
+extern unsigned int arpt_do_table(void *priv, struct sk_buff *skb,
+				  const struct nf_hook_state *state);
 
 #ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 #include <net/compat.h>
-- 
cgit v1.2.3


From 44b5990e7b463240e4c116c9e8670c67dad960cc Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 11 Oct 2021 17:15:13 +0200
Subject: netfilter: ip6tables: allow use of ip6t_do_table as hookfn

This is possible now that the xt_table structure is passed via *priv.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_ipv6/ip6_tables.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h
index 79e73fd7d965..8b8885a73c76 100644
--- a/include/linux/netfilter_ipv6/ip6_tables.h
+++ b/include/linux/netfilter_ipv6/ip6_tables.h
@@ -29,9 +29,8 @@ int ip6t_register_table(struct net *net, const struct xt_table *table,
 			const struct nf_hook_ops *ops);
 void ip6t_unregister_table_pre_exit(struct net *net, const char *name);
 void ip6t_unregister_table_exit(struct net *net, const char *name);
-extern unsigned int ip6t_do_table(struct sk_buff *skb,
-				  const struct nf_hook_state *state,
-				  struct xt_table *table);
+extern unsigned int ip6t_do_table(void *priv, struct sk_buff *skb,
+				  const struct nf_hook_state *state);
 
 #ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 #include <net/compat.h>
-- 
cgit v1.2.3


From f0d6764f7ddbf6d57302155b0f83eadb25ab0f0c Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 11 Oct 2021 17:15:14 +0200
Subject: netfilter: ebtables: allow use of ebt_do_table as hookfn

This is possible now that the xt_table structure is passed via *priv.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_bridge/ebtables.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h
index 10a01978bc0d..a13296d6c7ce 100644
--- a/include/linux/netfilter_bridge/ebtables.h
+++ b/include/linux/netfilter_bridge/ebtables.h
@@ -112,9 +112,8 @@ extern int ebt_register_table(struct net *net,
 			      const struct nf_hook_ops *ops);
 extern void ebt_unregister_table(struct net *net, const char *tablename);
 void ebt_unregister_table_pre_exit(struct net *net, const char *tablename);
-extern unsigned int ebt_do_table(struct sk_buff *skb,
-				 const struct nf_hook_state *state,
-				 struct ebt_table *table);
+extern unsigned int ebt_do_table(void *priv, struct sk_buff *skb,
+				 const struct nf_hook_state *state);
 
 /* True if the hook mask denotes that the rule is in a base chain,
  * used in the check() functions */
-- 
cgit v1.2.3


From 8b017fbe0bbb98dd71fb4850f6b9cc0e136a26b8 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 14 Oct 2021 11:00:37 +0200
Subject: net: of: fix stub of_net helpers for CONFIG_NET=n

Moving the of_net code from drivers/of/ to net/core means we
no longer stub out the helpers when networking is disabled,
which leads to a randconfig build failure with at least one
ARM platform that calls this from non-networking code:

arm-linux-gnueabi-ld: arch/arm/mach-mvebu/kirkwood.o: in function `kirkwood_dt_eth_fixup':
kirkwood.c:(.init.text+0x54): undefined reference to `of_get_mac_address'

Restore the way this worked before by changing that #ifdef
check back to testing for both CONFIG_OF and CONFIG_NET.

Fixes: e330fb14590c ("of: net: move of_net under net/")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/r/20211014090055.2058949-1-arnd@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/of_net.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/of_net.h b/include/linux/of_net.h
index 0797e2edb8c2..0484b613ca64 100644
--- a/include/linux/of_net.h
+++ b/include/linux/of_net.h
@@ -8,7 +8,7 @@
 
 #include <linux/phy.h>
 
-#ifdef CONFIG_OF
+#if defined(CONFIG_OF) && defined(CONFIG_NET)
 #include <linux/of.h>
 
 struct net_device;
-- 
cgit v1.2.3


From 52f88693378a58094c538662ba652aff0253c4fe Mon Sep 17 00:00:00 2001
From: Todd Kjos <tkjos@google.com>
Date: Tue, 12 Oct 2021 09:56:13 -0700
Subject: binder: use cred instead of task for selinux checks

Since binder was integrated with selinux, it has passed
'struct task_struct' associated with the binder_proc
to represent the source and target of transactions.
The conversion of task to SID was then done in the hook
implementations. It turns out that there are race conditions
which can result in an incorrect security context being used.

Fix by using the 'struct cred' saved during binder_open and pass
it to the selinux subsystem.

Cc: stable@vger.kernel.org # 5.14 (need backport for earlier stables)
Fixes: 79af73079d75 ("Add security hooks to binder and implement the hooks for SELinux.")
Suggested-by: Jann Horn <jannh@google.com>
Signed-off-by: Todd Kjos <tkjos@google.com>
Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h | 14 +++++++-------
 include/linux/lsm_hooks.h     | 14 +++++++-------
 include/linux/security.h      | 28 ++++++++++++++--------------
 3 files changed, 28 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index b3c525353769..4c7ed0268ce3 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -26,13 +26,13 @@
  *   #undef LSM_HOOK
  * };
  */
-LSM_HOOK(int, 0, binder_set_context_mgr, struct task_struct *mgr)
-LSM_HOOK(int, 0, binder_transaction, struct task_struct *from,
-	 struct task_struct *to)
-LSM_HOOK(int, 0, binder_transfer_binder, struct task_struct *from,
-	 struct task_struct *to)
-LSM_HOOK(int, 0, binder_transfer_file, struct task_struct *from,
-	 struct task_struct *to, struct file *file)
+LSM_HOOK(int, 0, binder_set_context_mgr, const struct cred *mgr)
+LSM_HOOK(int, 0, binder_transaction, const struct cred *from,
+	 const struct cred *to)
+LSM_HOOK(int, 0, binder_transfer_binder, const struct cred *from,
+	 const struct cred *to)
+LSM_HOOK(int, 0, binder_transfer_file, const struct cred *from,
+	 const struct cred *to, struct file *file)
 LSM_HOOK(int, 0, ptrace_access_check, struct task_struct *child,
 	 unsigned int mode)
 LSM_HOOK(int, 0, ptrace_traceme, struct task_struct *parent)
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 0eb0ae95c4c4..528554e9b90c 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1313,22 +1313,22 @@
  *
  * @binder_set_context_mgr:
  *	Check whether @mgr is allowed to be the binder context manager.
- *	@mgr contains the task_struct for the task being registered.
+ *	@mgr contains the struct cred for the current binder process.
  *	Return 0 if permission is granted.
  * @binder_transaction:
  *	Check whether @from is allowed to invoke a binder transaction call
  *	to @to.
- *	@from contains the task_struct for the sending task.
- *	@to contains the task_struct for the receiving task.
+ *	@from contains the struct cred for the sending process.
+ *	@to contains the struct cred for the receiving process.
  * @binder_transfer_binder:
  *	Check whether @from is allowed to transfer a binder reference to @to.
- *	@from contains the task_struct for the sending task.
- *	@to contains the task_struct for the receiving task.
+ *	@from contains the struct cred for the sending process.
+ *	@to contains the struct cred for the receiving process.
  * @binder_transfer_file:
  *	Check whether @from is allowed to transfer @file to @to.
- *	@from contains the task_struct for the sending task.
+ *	@from contains the struct cred for the sending process.
  *	@file contains the struct file being transferred.
- *	@to contains the task_struct for the receiving task.
+ *	@to contains the struct cred for the receiving process.
  *
  * @ptrace_access_check:
  *	Check permission before allowing the current process to trace the
diff --git a/include/linux/security.h b/include/linux/security.h
index 7979b9629a42..9be72166e859 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -258,13 +258,13 @@ extern int security_init(void);
 extern int early_security_init(void);
 
 /* Security operations */
-int security_binder_set_context_mgr(struct task_struct *mgr);
-int security_binder_transaction(struct task_struct *from,
-				struct task_struct *to);
-int security_binder_transfer_binder(struct task_struct *from,
-				    struct task_struct *to);
-int security_binder_transfer_file(struct task_struct *from,
-				  struct task_struct *to, struct file *file);
+int security_binder_set_context_mgr(const struct cred *mgr);
+int security_binder_transaction(const struct cred *from,
+				const struct cred *to);
+int security_binder_transfer_binder(const struct cred *from,
+				    const struct cred *to);
+int security_binder_transfer_file(const struct cred *from,
+				  const struct cred *to, struct file *file);
 int security_ptrace_access_check(struct task_struct *child, unsigned int mode);
 int security_ptrace_traceme(struct task_struct *parent);
 int security_capget(struct task_struct *target,
@@ -508,25 +508,25 @@ static inline int early_security_init(void)
 	return 0;
 }
 
-static inline int security_binder_set_context_mgr(struct task_struct *mgr)
+static inline int security_binder_set_context_mgr(const struct cred *mgr)
 {
 	return 0;
 }
 
-static inline int security_binder_transaction(struct task_struct *from,
-					      struct task_struct *to)
+static inline int security_binder_transaction(const struct cred *from,
+					      const struct cred *to)
 {
 	return 0;
 }
 
-static inline int security_binder_transfer_binder(struct task_struct *from,
-						  struct task_struct *to)
+static inline int security_binder_transfer_binder(const struct cred *from,
+						  const struct cred *to)
 {
 	return 0;
 }
 
-static inline int security_binder_transfer_file(struct task_struct *from,
-						struct task_struct *to,
+static inline int security_binder_transfer_file(const struct cred *from,
+						const struct cred *to,
 						struct file *file)
 {
 	return 0;
-- 
cgit v1.2.3


From 4d5b5539742d2554591751b4248b0204d20dcc9d Mon Sep 17 00:00:00 2001
From: Todd Kjos <tkjos@google.com>
Date: Tue, 12 Oct 2021 09:56:14 -0700
Subject: binder: use cred instead of task for getsecid

Use the 'struct cred' saved at binder_open() to lookup
the security ID via security_cred_getsecid(). This
ensures that the security context that opened binder
is the one used to generate the secctx.

Cc: stable@vger.kernel.org # 5.4+
Fixes: ec74136ded79 ("binder: create node flag to request sender's security context")
Signed-off-by: Todd Kjos <tkjos@google.com>
Suggested-by: Stephen Smalley <stephen.smalley.work@gmail.com>
Reported-by: kernel test robot <lkp@intel.com>
Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/security.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/security.h b/include/linux/security.h
index 9be72166e859..cc6d39358336 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1041,6 +1041,11 @@ static inline void security_transfer_creds(struct cred *new,
 {
 }
 
+static inline void security_cred_getsecid(const struct cred *c, u32 *secid)
+{
+	*secid = 0;
+}
+
 static inline int security_kernel_act_as(struct cred *cred, u32 secid)
 {
 	return 0;
-- 
cgit v1.2.3


From c072c4ef7ef09e1d6470c48cf52570487589b76a Mon Sep 17 00:00:00 2001
From: Sam Protsenko <semen.protsenko@linaro.org>
Date: Thu, 14 Oct 2021 16:35:06 +0300
Subject: soc: samsung: exynos-chipid: Pass revision reg offsets

Old Exynos SoCs have both Product ID and Revision ID in one single
register, while new SoCs tend to have two separate registers for those
IDs. Implement handling of both cases by passing Revision ID register
offsets in driver data.

Previously existing macros for Exynos4210 (removed in this patch) were
incorrect:

    #define EXYNOS_SUBREV_MASK         (0xf << 4)
    #define EXYNOS_MAINREV_MASK        (0xf << 0)

Actual format of PRO_ID register in Exynos4210 (offset 0x0):

    [31:12] Product ID
      [9:8] Package information
      [7:4] Main Revision Number
      [3:0] Sub Revision Number

This patch doesn't change the behavior on existing platforms, so
'/sys/devices/soc0/revision' will show the same string as before.

Signed-off-by: Sam Protsenko <semen.protsenko@linaro.org>
Tested-by: Henrik Grimler <henrik@grimler.se>
Link: https://lore.kernel.org/r/20211014133508.1210-1-semen.protsenko@linaro.org
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
---
 include/linux/soc/samsung/exynos-chipid.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/soc/samsung/exynos-chipid.h b/include/linux/soc/samsung/exynos-chipid.h
index 8bca6763f99c..62f0e2531068 100644
--- a/include/linux/soc/samsung/exynos-chipid.h
+++ b/include/linux/soc/samsung/exynos-chipid.h
@@ -9,10 +9,8 @@
 #define __LINUX_SOC_EXYNOS_CHIPID_H
 
 #define EXYNOS_CHIPID_REG_PRO_ID	0x00
-#define EXYNOS_SUBREV_MASK		(0xf << 4)
-#define EXYNOS_MAINREV_MASK		(0xf << 0)
-#define EXYNOS_REV_MASK			(EXYNOS_SUBREV_MASK | \
-					 EXYNOS_MAINREV_MASK)
+#define EXYNOS_REV_PART_MASK		0xf
+#define EXYNOS_REV_PART_SHIFT		4
 #define EXYNOS_MASK			0xfffff000
 
 #define EXYNOS_CHIPID_REG_PKG_ID	0x04
-- 
cgit v1.2.3


From 42a20f86dc19f9282d974df0ba4d226c865ab9dd Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 29 Sep 2021 15:02:14 -0700
Subject: sched: Add wrapper for get_wchan() to keep task blocked

Having a stable wchan means the process must be blocked and for it to
stay that way while performing stack unwinding.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk> [arm]
Tested-by: Mark Rutland <mark.rutland@arm.com> [arm64]
Link: https://lkml.kernel.org/r/20211008111626.332092234@infradead.org
---
 include/linux/sched.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 343603f77f8b..5f8db54226af 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2139,6 +2139,7 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
 #endif /* CONFIG_SMP */
 
 extern bool sched_task_on_rq(struct task_struct *p);
+extern unsigned long get_wchan(struct task_struct *p);
 
 /*
  * In order to reduce various lock holder preemption latencies provide an
-- 
cgit v1.2.3


From c5e22feffdd736cb02b98b0f5b375c8ebc858dd4 Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Date: Fri, 24 Sep 2021 20:51:02 +1200
Subject: topology: Represent clusters of CPUs within a die

Both ACPI and DT provide the ability to describe additional layers of
topology between that of individual cores and higher level constructs
such as the level at which the last level cache is shared.
In ACPI this can be represented in PPTT as a Processor Hierarchy
Node Structure [1] that is the parent of the CPU cores and in turn
has a parent Processor Hierarchy Nodes Structure representing
a higher level of topology.

For example Kunpeng 920 has 6 or 8 clusters in each NUMA node, and each
cluster has 4 cpus. All clusters share L3 cache data, but each cluster
has local L3 tag. On the other hand, each clusters will share some
internal system bus.

+-----------------------------------+                          +---------+
|  +------+    +------+             +--------------------------+         |
|  | CPU0 |    | cpu1 |             |    +-----------+         |         |
|  +------+    +------+             |    |           |         |         |
|                                   +----+    L3     |         |         |
|  +------+    +------+   cluster   |    |    tag    |         |         |
|  | CPU2 |    | CPU3 |             |    |           |         |         |
|  +------+    +------+             |    +-----------+         |         |
|                                   |                          |         |
+-----------------------------------+                          |         |
+-----------------------------------+                          |         |
|  +------+    +------+             +--------------------------+         |
|  |      |    |      |             |    +-----------+         |         |
|  +------+    +------+             |    |           |         |         |
|                                   |    |    L3     |         |         |
|  +------+    +------+             +----+    tag    |         |         |
|  |      |    |      |             |    |           |         |         |
|  +------+    +------+             |    +-----------+         |         |
|                                   |                          |         |
+-----------------------------------+                          |   L3    |
                                                               |   data  |
+-----------------------------------+                          |         |
|  +------+    +------+             |    +-----------+         |         |
|  |      |    |      |             |    |           |         |         |
|  +------+    +------+             +----+    L3     |         |         |
|                                   |    |    tag    |         |         |
|  +------+    +------+             |    |           |         |         |
|  |      |    |      |             |    +-----------+         |         |
|  +------+    +------+             +--------------------------+         |
+-----------------------------------|                          |         |
+-----------------------------------|                          |         |
|  +------+    +------+             +--------------------------+         |
|  |      |    |      |             |    +-----------+         |         |
|  +------+    +------+             |    |           |         |         |
|                                   +----+    L3     |         |         |
|  +------+    +------+             |    |    tag    |         |         |
|  |      |    |      |             |    |           |         |         |
|  +------+    +------+             |    +-----------+         |         |
|                                   |                          |         |
+-----------------------------------+                          |         |
+-----------------------------------+                          |         |
|  +------+    +------+             +--------------------------+         |
|  |      |    |      |             |   +-----------+          |         |
|  +------+    +------+             |   |           |          |         |
|                                   |   |    L3     |          |         |
|  +------+    +------+             +---+    tag    |          |         |
|  |      |    |      |             |   |           |          |         |
|  +------+    +------+             |   +-----------+          |         |
|                                   |                          |         |
+-----------------------------------+                          |         |
+-----------------------------------+                          |         |
|  +------+    +------+             +--------------------------+         |
|  |      |    |      |             |  +-----------+           |         |
|  +------+    +------+             |  |           |           |         |
|                                   |  |    L3     |           |         |
|  +------+    +------+             +--+    tag    |           |         |
|  |      |    |      |             |  |           |           |         |
|  +------+    +------+             |  +-----------+           |         |
|                                   |                          +---------+
+-----------------------------------+

That means spreading tasks among clusters will bring more bandwidth
while packing tasks within one cluster will lead to smaller cache
synchronization latency. So both kernel and userspace will have
a chance to leverage this topology to deploy tasks accordingly to
achieve either smaller cache latency within one cluster or an even
distribution of load among clusters for higher throughput.

This patch exposes cluster topology to both kernel and userspace.
Libraried like hwloc will know cluster by cluster_cpus and related
sysfs attributes. PoC of HWLOC support at [2].

Note this patch only handle the ACPI case.

Special consideration is needed for SMT processors, where it is
necessary to move 2 levels up the hierarchy from the leaf nodes
(thus skipping the processor core level).

Note that arm64 / ACPI does not provide any means of identifying
a die level in the topology but that may be unrelate to the cluster
level.

[1] ACPI Specification 6.3 - section 5.2.29.1 processor hierarchy node
    structure (Type 0)
[2] https://github.com/hisilicon/hwloc/tree/linux-cluster

Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Tian Tao <tiantao6@hisilicon.com>
Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20210924085104.44806-2-21cnbao@gmail.com
---
 include/linux/acpi.h          | 5 +++++
 include/linux/arch_topology.h | 5 +++++
 include/linux/topology.h      | 6 ++++++
 3 files changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 974d497a897d..fbc2146050a4 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1353,6 +1353,7 @@ static inline int lpit_read_residency_count_address(u64 *address)
 #ifdef CONFIG_ACPI_PPTT
 int acpi_pptt_cpu_is_thread(unsigned int cpu);
 int find_acpi_cpu_topology(unsigned int cpu, int level);
+int find_acpi_cpu_topology_cluster(unsigned int cpu);
 int find_acpi_cpu_topology_package(unsigned int cpu);
 int find_acpi_cpu_topology_hetero_id(unsigned int cpu);
 int find_acpi_cpu_cache_topology(unsigned int cpu, int level);
@@ -1365,6 +1366,10 @@ static inline int find_acpi_cpu_topology(unsigned int cpu, int level)
 {
 	return -EINVAL;
 }
+static inline int find_acpi_cpu_topology_cluster(unsigned int cpu)
+{
+	return -EINVAL;
+}
 static inline int find_acpi_cpu_topology_package(unsigned int cpu)
 {
 	return -EINVAL;
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index f180240dc95f..b97cea83b25e 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -62,10 +62,12 @@ void topology_set_thermal_pressure(const struct cpumask *cpus,
 struct cpu_topology {
 	int thread_id;
 	int core_id;
+	int cluster_id;
 	int package_id;
 	int llc_id;
 	cpumask_t thread_sibling;
 	cpumask_t core_sibling;
+	cpumask_t cluster_sibling;
 	cpumask_t llc_sibling;
 };
 
@@ -73,13 +75,16 @@ struct cpu_topology {
 extern struct cpu_topology cpu_topology[NR_CPUS];
 
 #define topology_physical_package_id(cpu)	(cpu_topology[cpu].package_id)
+#define topology_cluster_id(cpu)	(cpu_topology[cpu].cluster_id)
 #define topology_core_id(cpu)		(cpu_topology[cpu].core_id)
 #define topology_core_cpumask(cpu)	(&cpu_topology[cpu].core_sibling)
 #define topology_sibling_cpumask(cpu)	(&cpu_topology[cpu].thread_sibling)
+#define topology_cluster_cpumask(cpu)	(&cpu_topology[cpu].cluster_sibling)
 #define topology_llc_cpumask(cpu)	(&cpu_topology[cpu].llc_sibling)
 void init_cpu_topology(void);
 void store_cpu_topology(unsigned int cpuid);
 const struct cpumask *cpu_coregroup_mask(int cpu);
+const struct cpumask *cpu_clustergroup_mask(int cpu);
 void update_siblings_masks(unsigned int cpu);
 void remove_cpu_topology(unsigned int cpuid);
 void reset_cpu_topology(void);
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 7634cd737061..80d27d717631 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -186,6 +186,9 @@ static inline int cpu_to_mem(int cpu)
 #ifndef topology_die_id
 #define topology_die_id(cpu)			((void)(cpu), -1)
 #endif
+#ifndef topology_cluster_id
+#define topology_cluster_id(cpu)		((void)(cpu), -1)
+#endif
 #ifndef topology_core_id
 #define topology_core_id(cpu)			((void)(cpu), 0)
 #endif
@@ -195,6 +198,9 @@ static inline int cpu_to_mem(int cpu)
 #ifndef topology_core_cpumask
 #define topology_core_cpumask(cpu)		cpumask_of(cpu)
 #endif
+#ifndef topology_cluster_cpumask
+#define topology_cluster_cpumask(cpu)		cpumask_of(cpu)
+#endif
 #ifndef topology_die_cpumask
 #define topology_die_cpumask(cpu)		cpumask_of(cpu)
 #endif
-- 
cgit v1.2.3


From 778c558f49a2cb3dc7b18a80ff515e82aa813627 Mon Sep 17 00:00:00 2001
From: Barry Song <song.bao.hua@hisilicon.com>
Date: Fri, 24 Sep 2021 20:51:03 +1200
Subject: sched: Add cluster scheduler level in core and related Kconfig for
 ARM64

This patch adds scheduler level for clusters and automatically enables
the load balance among clusters. It will directly benefit a lot of
workload which loves more resources such as memory bandwidth, caches.

Testing has widely been done in two different hardware configurations of
Kunpeng920:

 24 cores in one NUMA(6 clusters in each NUMA node);
 32 cores in one NUMA(8 clusters in each NUMA node)

Workload is running on either one NUMA node or four NUMA nodes, thus,
this can estimate the effect of cluster spreading w/ and w/o NUMA load
balance.

* Stream benchmark:

4threads stream (on 1NUMA * 24cores = 24cores)
                stream                 stream
                w/o patch              w/ patch
MB/sec copy     29929.64 (   0.00%)    32932.68 (  10.03%)
MB/sec scale    29861.10 (   0.00%)    32710.58 (   9.54%)
MB/sec add      27034.42 (   0.00%)    32400.68 (  19.85%)
MB/sec triad    27225.26 (   0.00%)    31965.36 (  17.41%)

6threads stream (on 1NUMA * 24cores = 24cores)
                stream                 stream
                w/o patch              w/ patch
MB/sec copy     40330.24 (   0.00%)    42377.68 (   5.08%)
MB/sec scale    40196.42 (   0.00%)    42197.90 (   4.98%)
MB/sec add      37427.00 (   0.00%)    41960.78 (  12.11%)
MB/sec triad    37841.36 (   0.00%)    42513.64 (  12.35%)

12threads stream (on 1NUMA * 24cores = 24cores)
                stream                 stream
                w/o patch              w/ patch
MB/sec copy     52639.82 (   0.00%)    53818.04 (   2.24%)
MB/sec scale    52350.30 (   0.00%)    53253.38 (   1.73%)
MB/sec add      53607.68 (   0.00%)    55198.82 (   2.97%)
MB/sec triad    54776.66 (   0.00%)    56360.40 (   2.89%)

Thus, it could help memory-bound workload especially under medium load.
Similar improvement is also seen in lkp-pbzip2:

* lkp-pbzip2 benchmark

2-96 threads (on 4NUMA * 24cores = 96cores)
                  lkp-pbzip2              lkp-pbzip2
                  w/o patch               w/ patch
Hmean     tput-2   11062841.57 (   0.00%)  11341817.51 *   2.52%*
Hmean     tput-5   26815503.70 (   0.00%)  27412872.65 *   2.23%*
Hmean     tput-8   41873782.21 (   0.00%)  43326212.92 *   3.47%*
Hmean     tput-12  61875980.48 (   0.00%)  64578337.51 *   4.37%*
Hmean     tput-21 105814963.07 (   0.00%) 111381851.01 *   5.26%*
Hmean     tput-30 150349470.98 (   0.00%) 156507070.73 *   4.10%*
Hmean     tput-48 237195937.69 (   0.00%) 242353597.17 *   2.17%*
Hmean     tput-79 360252509.37 (   0.00%) 362635169.23 *   0.66%*
Hmean     tput-96 394571737.90 (   0.00%) 400952978.48 *   1.62%*

2-24 threads (on 1NUMA * 24cores = 24cores)
                 lkp-pbzip2               lkp-pbzip2
                 w/o patch                w/ patch
Hmean     tput-2   11071705.49 (   0.00%)  11296869.10 *   2.03%*
Hmean     tput-4   20782165.19 (   0.00%)  21949232.15 *   5.62%*
Hmean     tput-6   30489565.14 (   0.00%)  33023026.96 *   8.31%*
Hmean     tput-8   40376495.80 (   0.00%)  42779286.27 *   5.95%*
Hmean     tput-12  61264033.85 (   0.00%)  62995632.78 *   2.83%*
Hmean     tput-18  86697139.39 (   0.00%)  86461545.74 (  -0.27%)
Hmean     tput-24 104854637.04 (   0.00%) 104522649.46 *  -0.32%*

In the case of 6 threads and 8 threads, we see the greatest performance
improvement.

Similar improvement can be seen on lkp-pixz though the improvement is
smaller:

* lkp-pixz benchmark

2-24 threads lkp-pixz (on 1NUMA * 24cores = 24cores)
                  lkp-pixz               lkp-pixz
                  w/o patch              w/ patch
Hmean     tput-2   6486981.16 (   0.00%)  6561515.98 *   1.15%*
Hmean     tput-4  11645766.38 (   0.00%) 11614628.43 (  -0.27%)
Hmean     tput-6  15429943.96 (   0.00%) 15957350.76 *   3.42%*
Hmean     tput-8  19974087.63 (   0.00%) 20413746.98 *   2.20%*
Hmean     tput-12 28172068.18 (   0.00%) 28751997.06 *   2.06%*
Hmean     tput-18 39413409.54 (   0.00%) 39896830.55 *   1.23%*
Hmean     tput-24 49101815.85 (   0.00%) 49418141.47 *   0.64%*

* SPECrate benchmark

4,8,16 copies mcf_r(on 1NUMA * 32cores = 32cores)
		Base     	 	Base
		Run Time   	 	Rate
		-------  	 	---------
4 Copies	w/o 580 (w/ 570)       	w/o 11.1 (w/ 11.3)
8 Copies	w/o 647 (w/ 605)       	w/o 20.0 (w/ 21.4, +7%)
16 Copies	w/o 844 (w/ 844)       	w/o 30.6 (w/ 30.6)

32 Copies(on 4NUMA * 32 cores = 128cores)
[w/o patch]
                 Base     Base        Base
Benchmarks       Copies  Run Time     Rate
--------------- -------  ---------  ---------
500.perlbench_r      32        584       87.2  *
502.gcc_r            32        503       90.2  *
505.mcf_r            32        745       69.4  *
520.omnetpp_r        32       1031       40.7  *
523.xalancbmk_r      32        597       56.6  *
525.x264_r            1         --            CE
531.deepsjeng_r      32        336      109    *
541.leela_r          32        556       95.4  *
548.exchange2_r      32        513      163    *
557.xz_r             32        530       65.2  *
 Est. SPECrate2017_int_base              80.3

[w/ patch]
                  Base     Base        Base
Benchmarks       Copies  Run Time     Rate
--------------- -------  ---------  ---------
500.perlbench_r      32        580      87.8 (+0.688%)  *
502.gcc_r            32        477      95.1 (+5.432%)  *
505.mcf_r            32        644      80.3 (+13.574%) *
520.omnetpp_r        32        942      44.6 (+9.58%)   *
523.xalancbmk_r      32        560      60.4 (+6.714%%) *
525.x264_r            1         --           CE
531.deepsjeng_r      32        337      109  (+0.000%) *
541.leela_r          32        554      95.6 (+0.210%) *
548.exchange2_r      32        515      163  (+0.000%) *
557.xz_r             32        524      66.0 (+1.227%) *
 Est. SPECrate2017_int_base              83.7 (+4.062%)

On the other hand, it is slightly helpful to CPU-bound tasks like
kernbench:

* 24-96 threads kernbench (on 4NUMA * 24cores = 96cores)
                     kernbench              kernbench
                     w/o cluster            w/ cluster
Min       user-24    12054.67 (   0.00%)    12024.19 (   0.25%)
Min       syst-24     1751.51 (   0.00%)     1731.68 (   1.13%)
Min       elsp-24      600.46 (   0.00%)      598.64 (   0.30%)
Min       user-48    12361.93 (   0.00%)    12315.32 (   0.38%)
Min       syst-48     1917.66 (   0.00%)     1892.73 (   1.30%)
Min       elsp-48      333.96 (   0.00%)      332.57 (   0.42%)
Min       user-96    12922.40 (   0.00%)    12921.17 (   0.01%)
Min       syst-96     2143.94 (   0.00%)     2110.39 (   1.56%)
Min       elsp-96      211.22 (   0.00%)      210.47 (   0.36%)
Amean     user-24    12063.99 (   0.00%)    12030.78 *   0.28%*
Amean     syst-24     1755.20 (   0.00%)     1735.53 *   1.12%*
Amean     elsp-24      601.60 (   0.00%)      600.19 (   0.23%)
Amean     user-48    12362.62 (   0.00%)    12315.56 *   0.38%*
Amean     syst-48     1921.59 (   0.00%)     1894.95 *   1.39%*
Amean     elsp-48      334.10 (   0.00%)      332.82 *   0.38%*
Amean     user-96    12925.27 (   0.00%)    12922.63 (   0.02%)
Amean     syst-96     2146.66 (   0.00%)     2122.20 *   1.14%*
Amean     elsp-96      211.96 (   0.00%)      211.79 (   0.08%)

Note this patch isn't an universal win, it might hurt those workload
which can benefit from packing. Though tasks which want to take
advantages of lower communication latency of one cluster won't
necessarily been packed in one cluster while kernel is not aware of
clusters, they have some chance to be randomly packed. But this
patch will make them more likely spread.

Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
Tested-by: Yicong Yang <yangyicong@hisilicon.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/sched/topology.h | 7 +++++++
 include/linux/topology.h       | 7 +++++++
 2 files changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 8f0f778b7c91..2f9166f6dec8 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -42,6 +42,13 @@ static inline int cpu_smt_flags(void)
 }
 #endif
 
+#ifdef CONFIG_SCHED_CLUSTER
+static inline int cpu_cluster_flags(void)
+{
+	return SD_SHARE_PKG_RESOURCES;
+}
+#endif
+
 #ifdef CONFIG_SCHED_MC
 static inline int cpu_core_flags(void)
 {
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 80d27d717631..0b3704ad13c8 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -212,6 +212,13 @@ static inline const struct cpumask *cpu_smt_mask(int cpu)
 }
 #endif
 
+#if defined(CONFIG_SCHED_CLUSTER) && !defined(cpu_cluster_mask)
+static inline const struct cpumask *cpu_cluster_mask(int cpu)
+{
+	return topology_cluster_cpumask(cpu);
+}
+#endif
+
 static inline const struct cpumask *cpu_cpu_mask(int cpu)
 {
 	return cpumask_of_node(cpu_to_node(cpu));
-- 
cgit v1.2.3


From 810979682ccc98dbd83f341c18a2e556c30a7164 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 6 Oct 2021 13:18:50 +0200
Subject: irq_work: Allow irq_work_sync() to sleep if irq_work() no IRQ
 support.

irq_work() triggers instantly an interrupt if supported by the
architecture. Otherwise the work will be processed on the next timer
tick. In worst case irq_work_sync() could spin up to a jiffy.

irq_work_sync() is usually used in tear down context which is fully
preemptible. Based on review irq_work_sync() is invoked from preemptible
context and there is one waiter at a time. This qualifies it to use
rcuwait for synchronisation.

Let irq_work_sync() synchronize with rcuwait if the architecture
processes irqwork via the timer tick.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20211006111852.1514359-3-bigeasy@linutronix.de
---
 include/linux/irq_work.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
index ec2a47a81e42..b48955e9c920 100644
--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -3,6 +3,7 @@
 #define _LINUX_IRQ_WORK_H
 
 #include <linux/smp_types.h>
+#include <linux/rcuwait.h>
 
 /*
  * An entry can be in one of four states:
@@ -16,11 +17,13 @@
 struct irq_work {
 	struct __call_single_node node;
 	void (*func)(struct irq_work *);
+	struct rcuwait irqwait;
 };
 
 #define __IRQ_WORK_INIT(_func, _flags) (struct irq_work){	\
 	.node = { .u_flags = (_flags), },			\
 	.func = (_func),					\
+	.irqwait = __RCUWAIT_INITIALIZER(irqwait),		\
 }
 
 #define IRQ_WORK_INIT(_func) __IRQ_WORK_INIT(_func, 0)
-- 
cgit v1.2.3


From 09089db79859cbccccd8df95b034f36f7027efa6 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 6 Oct 2021 13:18:52 +0200
Subject: irq_work: Also rcuwait for !IRQ_WORK_HARD_IRQ on PREEMPT_RT

On PREEMPT_RT most items are processed as LAZY via softirq context.
Avoid to spin-wait for them because irq_work_sync() could have higher
priority and not allow the irq-work to be completed.

Wait additionally for !IRQ_WORK_HARD_IRQ irq_work items on PREEMPT_RT.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20211006111852.1514359-5-bigeasy@linutronix.de
---
 include/linux/irq_work.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
index b48955e9c920..8cd11a223260 100644
--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -49,6 +49,11 @@ static inline bool irq_work_is_busy(struct irq_work *work)
 	return atomic_read(&work->node.a_flags) & IRQ_WORK_BUSY;
 }
 
+static inline bool irq_work_is_hard(struct irq_work *work)
+{
+	return atomic_read(&work->node.a_flags) & IRQ_WORK_HARD_IRQ;
+}
+
 bool irq_work_queue(struct irq_work *work);
 bool irq_work_queue_on(struct irq_work *work, int cpu);
 
-- 
cgit v1.2.3


From 8b8ff8cc3b8155c18162e8b1f70e1230db176862 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Tue, 7 Sep 2021 19:39:01 +0300
Subject: perf/x86: Add new event for AUX output counter index

PEBS-via-PT records contain a mask of applicable counters. To identify
which event belongs to which counter, a side-band event is needed. Until
now, there has been no side-band event, and consequently users were limited
to using a single event.

Add such a side-band event. Note the event is optimised to output only
when the counter index changes for an event. That works only so long as
all PEBS-via-PT events are scheduled together, which they are for a
recording session because they are in a single group.

Also no attribute bit is used to select the new event, so a new
kernel is not compatible with older perf tools.  The assumption
being that PEBS-via-PT is sufficiently esoteric that users will not
be troubled by this.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20210907163903.11820-2-adrian.hunter@intel.com
---
 include/linux/perf_event.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2d510ad750ed..126b3a314f3a 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1397,6 +1397,7 @@ perf_event_addr_filters(struct perf_event *event)
 }
 
 extern void perf_event_addr_filters_sync(struct perf_event *event);
+extern void perf_report_aux_output_id(struct perf_event *event, u64 hw_id);
 
 extern int perf_output_begin(struct perf_output_handle *handle,
 			     struct perf_sample_data *data,
-- 
cgit v1.2.3


From d00e60ee54b12de945b8493cf18c1ada9e422514 Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Wed, 13 Oct 2021 17:19:20 +0800
Subject: page_pool: disable dma mapping support for 32-bit arch with 64-bit
 DMA

As the 32-bit arch with 64-bit DMA seems to rare those days,
and page pool might carry a lot of code and complexity for
systems that possibly.

So disable dma mapping support for such systems, if drivers
really want to work on such systems, they have to implement
their own DMA-mapping fallback tracking outside page_pool.

Reviewed-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mm_types.h | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7f8ee09c711f..436e0946d691 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -104,18 +104,7 @@ struct page {
 			struct page_pool *pp;
 			unsigned long _pp_mapping_pad;
 			unsigned long dma_addr;
-			union {
-				/**
-				 * dma_addr_upper: might require a 64-bit
-				 * value on 32-bit architectures.
-				 */
-				unsigned long dma_addr_upper;
-				/**
-				 * For frag page support, not supported in
-				 * 32-bit architectures with 64-bit DMA.
-				 */
-				atomic_long_t pp_frag_count;
-			};
+			atomic_long_t pp_frag_count;
 		};
 		struct {	/* slab, slob and slub */
 			union {
-- 
cgit v1.2.3


From 9028b2463c1ea96f51c3ba53e2479346019ff6ad Mon Sep 17 00:00:00 2001
From: Jens Wiklander <jens.wiklander@linaro.org>
Date: Thu, 25 Mar 2021 15:08:44 +0100
Subject: tee: add sec_world_id to struct tee_shm

Adds sec_world_id to struct tee_shm which describes a shared memory
object. sec_world_id can be used by a driver to store an id assigned by
secure world.

Reviewed-by: Sumit Garg <sumit.garg@linaro.org>
Signed-off-by: Jens Wiklander <jens.wiklander@linaro.org>
---
 include/linux/tee_drv.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h
index 3ebfea0781f1..a1f03461369b 100644
--- a/include/linux/tee_drv.h
+++ b/include/linux/tee_drv.h
@@ -197,7 +197,11 @@ int tee_session_calc_client_uuid(uuid_t *uuid, u32 connection_method,
  * @num_pages:	number of locked pages
  * @dmabuf:	dmabuf used to for exporting to user space
  * @flags:	defined by TEE_SHM_* in tee_drv.h
- * @id:		unique id of a shared memory object on this device
+ * @id:		unique id of a shared memory object on this device, shared
+ *		with user space
+ * @sec_world_id:
+ *		secure world assigned id of this shared memory object, not
+ *		used by all drivers
  *
  * This pool is only supposed to be accessed directly from the TEE
  * subsystem and from drivers that implements their own shm pool manager.
@@ -213,6 +217,7 @@ struct tee_shm {
 	struct dma_buf *dmabuf;
 	u32 flags;
 	int id;
+	u64 sec_world_id;
 };
 
 /**
-- 
cgit v1.2.3


From 9955548919c47a6987b40d90a30fd56bbc043e7b Mon Sep 17 00:00:00 2001
From: Arnaud Pouliquen <arnaud.pouliquen@foss.st.com>
Date: Fri, 1 Oct 2021 12:12:30 +0200
Subject: remoteproc: Remove vdev_to_rvdev and vdev_to_rproc from remoteproc
 API

These both functions are only used by the remoteproc_virtio.
There is no reason to expose them in the API.
Move the functions in remoteproc_virtio.c

Signed-off-by: Arnaud Pouliquen <arnaud.pouliquen@foss.st.com>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Link: https://lore.kernel.org/r/20211001101234.4247-4-arnaud.pouliquen@foss.st.com
---
 include/linux/remoteproc.h | 12 ------------
 1 file changed, 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
index 83c09ac36b13..e0600e1e5c17 100644
--- a/include/linux/remoteproc.h
+++ b/include/linux/remoteproc.h
@@ -684,18 +684,6 @@ int rproc_coredump_add_custom_segment(struct rproc *rproc,
 				      void *priv);
 int rproc_coredump_set_elf_info(struct rproc *rproc, u8 class, u16 machine);
 
-static inline struct rproc_vdev *vdev_to_rvdev(struct virtio_device *vdev)
-{
-	return container_of(vdev->dev.parent, struct rproc_vdev, dev);
-}
-
-static inline struct rproc *vdev_to_rproc(struct virtio_device *vdev)
-{
-	struct rproc_vdev *rvdev = vdev_to_rvdev(vdev);
-
-	return rvdev->rproc;
-}
-
 void rproc_add_subdev(struct rproc *rproc, struct rproc_subdev *subdev);
 
 void rproc_remove_subdev(struct rproc *rproc, struct rproc_subdev *subdev);
-- 
cgit v1.2.3


From 789c1093f02c436b320d78a739f9610c8271cb73 Mon Sep 17 00:00:00 2001
From: Yang Yingliang <yangyingliang@huawei.com>
Date: Mon, 11 Oct 2021 21:21:14 +0800
Subject: rtc: class: don't call cdev_device_del() when cdev_device_add()
 failed

I got a null-ptr-deref report when doing fault injection test:

general protection fault, probably for non-canonical address 0xdffffc0000000022: 0000 [#1] SMP KASAN PTI
KASAN: null-ptr-deref in range [0x0000000000000110-0x0000000000000117]
RIP: 0010:device_del+0x132/0xdc0
Call Trace:
 cdev_device_del+0x1a/0x80
 devm_rtc_unregister_device+0x37/0x80
 release_nodes+0xc3/0x3b0

If cdev_device_add() fails, 'dev->p' is not set, it causes
null-ptr-deref when calling cdev_device_del(). Registering
character device is optional, we don't return error code
here, so introduce a new flag 'RTC_NO_CDEV' to indicate
if it has character device, cdev_device_del() is called
when this bit is not set.

Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Link: https://lore.kernel.org/r/20211011132114.3663509-1-yangyingliang@huawei.com
---
 include/linux/rtc.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index bd611e26291d..354e0843ab17 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -80,6 +80,7 @@ struct rtc_timer {
 
 /* flags */
 #define RTC_DEV_BUSY 0
+#define RTC_NO_CDEV  1
 
 struct rtc_device {
 	struct device dev;
-- 
cgit v1.2.3


From 4b2c5fa9c9902ce34ecea6711558d9af96351b31 Mon Sep 17 00:00:00 2001
From: Amir Tzin <amirtz@nvidia.com>
Date: Wed, 21 Jul 2021 16:14:12 +0300
Subject: net/mlx5: Add layout to support default timeouts register

Add needed structures and defines for DTOR (default timeouts register).
This will be used to get timeouts values from FW instead of hard coded
values in the driver code thus enabling support for slower devices which
need longer timeouts.

Signed-off-by: Amir Tzin <amirtz@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/device.h   |  4 +++-
 include/linux/mlx5/driver.h   |  1 +
 include/linux/mlx5/mlx5_ifc.h | 37 ++++++++++++++++++++++++++++++++++++-
 3 files changed, 40 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 66eaf0aa7f69..109cc8106d16 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -577,7 +577,9 @@ struct mlx5_init_seg {
 	__be32			rsvd1[120];
 	__be32			initializing;
 	struct health_buffer	health;
-	__be32			rsvd2[880];
+	__be32			rsvd2[878];
+	__be32			cmd_exec_to;
+	__be32			cmd_q_init_to;
 	__be32			internal_timer_h;
 	__be32			internal_timer_l;
 	__be32			rsvd3[2];
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 0ca719c00824..ccbd87fbd3bf 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -156,6 +156,7 @@ enum {
 	MLX5_REG_MIRC		 = 0x9162,
 	MLX5_REG_SBCAM		 = 0xB01F,
 	MLX5_REG_RESOURCE_DUMP   = 0xC000,
+	MLX5_REG_DTOR            = 0xC00E,
 };
 
 enum mlx5_qpts_trust_state {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 993204a6c1a1..b8bff5109656 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1306,7 +1306,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         vhca_resource_manager[0x1];
 
 	u8         hca_cap_2[0x1];
-	u8         reserved_at_21[0x2];
+	u8         reserved_at_21[0x1];
+	u8         dtor[0x1];
 	u8         event_on_vhca_state_teardown_request[0x1];
 	u8         event_on_vhca_state_in_use[0x1];
 	u8         event_on_vhca_state_active[0x1];
@@ -2807,6 +2808,40 @@ struct mlx5_ifc_dropped_packet_logged_bits {
 	u8         reserved_at_0[0xe0];
 };
 
+struct mlx5_ifc_default_timeout_bits {
+	u8         to_multiplier[0x3];
+	u8         reserved_at_3[0x9];
+	u8         to_value[0x14];
+};
+
+struct mlx5_ifc_dtor_reg_bits {
+	u8         reserved_at_0[0x20];
+
+	struct mlx5_ifc_default_timeout_bits pcie_toggle_to;
+
+	u8         reserved_at_40[0x60];
+
+	struct mlx5_ifc_default_timeout_bits health_poll_to;
+
+	struct mlx5_ifc_default_timeout_bits full_crdump_to;
+
+	struct mlx5_ifc_default_timeout_bits fw_reset_to;
+
+	struct mlx5_ifc_default_timeout_bits flush_on_err_to;
+
+	struct mlx5_ifc_default_timeout_bits pci_sync_update_to;
+
+	struct mlx5_ifc_default_timeout_bits tear_down_to;
+
+	struct mlx5_ifc_default_timeout_bits fsm_reactivate_to;
+
+	struct mlx5_ifc_default_timeout_bits reclaim_pages_to;
+
+	struct mlx5_ifc_default_timeout_bits reclaim_vfs_pages_to;
+
+	u8         reserved_at_1c0[0x40];
+};
+
 enum {
 	MLX5_CQ_ERROR_SYNDROME_CQ_OVERRUN                 = 0x1,
 	MLX5_CQ_ERROR_SYNDROME_CQ_ACCESS_VIOLATION_ERROR  = 0x2,
-- 
cgit v1.2.3


From 5945e1adeab527ec96c75a786213c146d4d482a4 Mon Sep 17 00:00:00 2001
From: Amir Tzin <amirtz@mellanox.com>
Date: Thu, 7 Oct 2021 18:00:27 +0300
Subject: net/mlx5: Read timeout values from init segment

Replace hard coded timeouts with values stored in firmware's init
segment. Timeouts are read from init segment during driver load. If init
segment timeouts are not supported then fallback to hard coded defaults
instead. Also move pre initialization timeouts which cannot be read from
firmware to the new mechanism.

Signed-off-by: Amir Tzin <amirtz@mellanox.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/driver.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index ccbd87fbd3bf..fb06e8870aee 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -66,10 +66,6 @@ enum {
 };
 
 enum {
-	/* one minute for the sake of bringup. Generally, commands must always
-	 * complete and we may need to increase this timeout value
-	 */
-	MLX5_CMD_TIMEOUT_MSEC	= 60 * 1000,
 	MLX5_CMD_WQ_MAX_NAME	= 32,
 };
 
@@ -755,6 +751,7 @@ struct mlx5_core_dev {
 		u32 qcam[MLX5_ST_SZ_DW(qcam_reg)];
 		u8  embedded_cpu;
 	} caps;
+	struct mlx5_timeouts	*timeouts;
 	u64			sys_image_guid;
 	phys_addr_t		iseg_base;
 	struct mlx5_init_seg __iomem *iseg;
-- 
cgit v1.2.3


From fbfa97b4d79f26042f188b84959065213e9d3e99 Mon Sep 17 00:00:00 2001
From: Shay Drory <shayd@nvidia.com>
Date: Wed, 18 Aug 2021 13:21:30 +0300
Subject: net/mlx5: Disable roce at HCA level

Currently, when a user disables roce via the devlink param, this change
isn't passed down to the device.
If device allows disabling RoCE at device level, make use of it. This
instructs the device to skip memory allocations related to RoCE
functionality which otherwise is done by the device.

Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Parav Pandit <parav@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/driver.h   | 9 +++++----
 include/linux/mlx5/mlx5_ifc.h | 3 ++-
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index fb06e8870aee..7c8b5f06c2cd 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1251,11 +1251,12 @@ static inline bool mlx5_is_roce_init_enabled(struct mlx5_core_dev *dev)
 {
 	struct devlink *devlink = priv_to_devlink(dev);
 	union devlink_param_value val;
+	int err;
 
-	devlink_param_driverinit_value_get(devlink,
-					   DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE,
-					   &val);
-	return val.vbool;
+	err = devlink_param_driverinit_value_get(devlink,
+						 DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE,
+						 &val);
+	return err ? MLX5_CAP_GEN(dev, roce) : val.vbool;
 }
 
 #endif /* MLX5_DRIVER_H */
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index b8bff5109656..c614ad1da44d 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1588,7 +1588,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         log_max_tis_per_sq[0x5];
 
 	u8         ext_stride_num_range[0x1];
-	u8         reserved_at_3a1[0x2];
+	u8         roce_rw_supported[0x1];
+	u8         reserved_at_3a2[0x1];
 	u8         log_max_stride_sz_rq[0x5];
 	u8         reserved_at_3a8[0x3];
 	u8         log_min_stride_sz_rq[0x5];
-- 
cgit v1.2.3


From 2ec16ddde1fa31a83aee04320b248e94348d9152 Mon Sep 17 00:00:00 2001
From: Rongwei Liu <rongweil@nvidia.com>
Date: Thu, 16 Sep 2021 10:46:17 +0300
Subject: net/mlx5: Introduce new device index wrapper

Downstream patches.

Signed-off-by: Rongwei Liu <rongweil@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/driver.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 7c8b5f06c2cd..aecc38b90de5 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1243,6 +1243,11 @@ static inline int mlx5_core_native_port_num(struct mlx5_core_dev *dev)
 	return MLX5_CAP_GEN(dev, native_port_num);
 }
 
+static inline int mlx5_get_dev_index(struct mlx5_core_dev *dev)
+{
+	return PCI_FUNC(dev->pdev->devfn);
+}
+
 enum {
 	MLX5_TRIGGERED_CMD_COMP = (u64)1 << 32,
 };
-- 
cgit v1.2.3


From 1021d0645d593ea86193c5fc371e33e5b208e14d Mon Sep 17 00:00:00 2001
From: Rongwei Liu <rongweil@nvidia.com>
Date: Tue, 12 Oct 2021 10:40:52 +0300
Subject: net/mlx5: Use native_port_num as 1st option of device index

Using "native_port_num" can support more NICs.

Fallback to PCIe IDs if "native_port_num" query fails.

Signed-off-by: Rongwei Liu <rongweil@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/driver.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index aecc38b90de5..cf508685abca 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1245,7 +1245,12 @@ static inline int mlx5_core_native_port_num(struct mlx5_core_dev *dev)
 
 static inline int mlx5_get_dev_index(struct mlx5_core_dev *dev)
 {
-	return PCI_FUNC(dev->pdev->devfn);
+	int idx = MLX5_CAP_GEN(dev, native_port_num);
+
+	if (idx >= 1 && idx <= MLX5_MAX_PORTS)
+		return idx - 1;
+	else
+		return PCI_FUNC(dev->pdev->devfn);
 }
 
 enum {
-- 
cgit v1.2.3


From 72bf80cf09c4693780ad93a31b48fa5a4e17a946 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ma=C3=ADra=20Canal?= <maira.canal@usp.br>
Date: Fri, 15 Oct 2021 12:14:35 -0300
Subject: regulator: lp872x: replacing legacy gpio interface for gpiod
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Removing all linux/gpio.h and linux/of_gpio.h dependencies and replacing
them with the gpiod interface

Signed-off-by: Maíra Canal <maira.canal@usp.br>
Message-Id: <YWma2yTyuwS5XwhY@fedora>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regulator/lp872x.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/regulator/lp872x.h b/include/linux/regulator/lp872x.h
index d780dbb8b423..8e7e0343c6e1 100644
--- a/include/linux/regulator/lp872x.h
+++ b/include/linux/regulator/lp872x.h
@@ -10,7 +10,7 @@
 
 #include <linux/regulator/machine.h>
 #include <linux/platform_device.h>
-#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
 
 #define LP872X_MAX_REGULATORS		9
 
@@ -41,8 +41,8 @@ enum lp872x_regulator_id {
 };
 
 enum lp872x_dvs_state {
-	DVS_LOW  = GPIOF_OUT_INIT_LOW,
-	DVS_HIGH = GPIOF_OUT_INIT_HIGH,
+	DVS_LOW  = GPIOD_OUT_LOW,
+	DVS_HIGH = GPIOD_OUT_HIGH,
 };
 
 enum lp872x_dvs_sel {
@@ -52,12 +52,12 @@ enum lp872x_dvs_sel {
 
 /**
  * lp872x_dvs
- * @gpio       : gpio pin number for dvs control
+ * @gpio       : gpio descriptor for dvs control
  * @vsel       : dvs selector for buck v1 or buck v2 register
  * @init_state : initial dvs pin state
  */
 struct lp872x_dvs {
-	int gpio;
+	struct gpio_desc *gpio;
 	enum lp872x_dvs_sel vsel;
 	enum lp872x_dvs_state init_state;
 };
@@ -78,14 +78,14 @@ struct lp872x_regulator_data {
  * @update_config     : if LP872X_GENERAL_CFG register is updated, set true
  * @regulator_data    : platform regulator id and init data
  * @dvs               : dvs data for buck voltage control
- * @enable_gpio       : gpio pin number for enable control
+ * @enable_gpio       : gpio descriptor for enable control
  */
 struct lp872x_platform_data {
 	u8 general_config;
 	bool update_config;
 	struct lp872x_regulator_data regulator_data[LP872X_MAX_REGULATORS];
 	struct lp872x_dvs *dvs;
-	int enable_gpio;
+	struct gpio_desc *enable_gpio;
 };
 
 #endif
-- 
cgit v1.2.3


From c3f69c7f629ff53c75f335754ea41426b8e095de Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Tue, 12 Oct 2021 16:35:14 -0700
Subject: scsi: ata: Switch to attribute groups

struct device supports attribute groups directly but does not support
struct device_attribute directly. Hence switch to attribute groups.

Link: https://lore.kernel.org/r/20211012233558.4066756-3-bvanassche@acm.org
Acked-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/linux/libata.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index c0c64f03e107..bd1b782d1bbf 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1388,7 +1388,7 @@ extern int ata_link_nr_enabled(struct ata_link *link);
  */
 extern const struct ata_port_operations ata_base_port_ops;
 extern const struct ata_port_operations sata_port_ops;
-extern struct device_attribute *ata_common_sdev_attrs[];
+extern const struct attribute_group *ata_common_sdev_groups[];
 
 /*
  * All sht initializers (BASE, PIO, BMDMA, NCQ) must be instantiated
@@ -1418,14 +1418,14 @@ extern struct device_attribute *ata_common_sdev_attrs[];
 
 #define ATA_BASE_SHT(drv_name)					\
 	ATA_SUBBASE_SHT(drv_name),				\
-	.sdev_attrs		= ata_common_sdev_attrs
+	.sdev_groups		= ata_common_sdev_groups
 
 #ifdef CONFIG_SATA_HOST
-extern struct device_attribute *ata_ncq_sdev_attrs[];
+extern const struct attribute_group *ata_ncq_sdev_groups[];
 
 #define ATA_NCQ_SHT(drv_name)					\
 	ATA_SUBBASE_SHT(drv_name),				\
-	.sdev_attrs		= ata_ncq_sdev_attrs,		\
+	.sdev_groups		= ata_ncq_sdev_groups,		\
 	.change_queue_depth	= ata_scsi_change_queue_depth
 #endif
 
-- 
cgit v1.2.3


From 05593a3fd1037b5fee85d3c8c28112f19e7baa06 Mon Sep 17 00:00:00 2001
From: William Breathitt Gray <vilhelm.gray@gmail.com>
Date: Fri, 27 Aug 2021 12:47:45 +0900
Subject: counter: stm32-lptimer-cnt: Provide defines for clock polarities

The STM32 low-power timer permits configuration of the clock polarity
via the LPTIMX_CFGR register CKPOL bits. This patch provides
preprocessor defines for the supported clock polarities.

Cc: Fabrice Gasnier <fabrice.gasnier@foss.st.com>
Signed-off-by: William Breathitt Gray <vilhelm.gray@gmail.com>
Reviewed-by: Fabrice Gasnier <fabrice.gasnier@foss.st.com>
Link: https://lore.kernel.org/r/a111c8905c467805ca530728f88189b59430f27e.1630031207.git.vilhelm.gray@gmail.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/mfd/stm32-lptimer.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/stm32-lptimer.h b/include/linux/mfd/stm32-lptimer.h
index 90b20550c1c8..06d3f11dc3c9 100644
--- a/include/linux/mfd/stm32-lptimer.h
+++ b/include/linux/mfd/stm32-lptimer.h
@@ -45,6 +45,11 @@
 #define STM32_LPTIM_PRESC	GENMASK(11, 9)
 #define STM32_LPTIM_CKPOL	GENMASK(2, 1)
 
+/* STM32_LPTIM_CKPOL */
+#define STM32_LPTIM_CKPOL_RISING_EDGE	0
+#define STM32_LPTIM_CKPOL_FALLING_EDGE	1
+#define STM32_LPTIM_CKPOL_BOTH_EDGES	2
+
 /* STM32_LPTIM_ARR */
 #define STM32_LPTIM_MAX_ARR	0xFFFF
 
-- 
cgit v1.2.3


From ea434ff82649111de4fcabd76187270f8abdb63a Mon Sep 17 00:00:00 2001
From: William Breathitt Gray <vilhelm.gray@gmail.com>
Date: Fri, 27 Aug 2021 12:47:46 +0900
Subject: counter: stm32-timer-cnt: Provide defines for slave mode selection

The STM32 timer permits configuration of the counter encoder mode via
the slave mode control register (SMCR) slave mode selection (SMS) bits.
This patch provides preprocessor defines for the supported encoder
modes.

Cc: Fabrice Gasnier <fabrice.gasnier@foss.st.com>
Signed-off-by: William Breathitt Gray <vilhelm.gray@gmail.com>
Reviewed-by: Fabrice Gasnier <fabrice.gasnier@foss.st.com>
Link: https://lore.kernel.org/r/ad3d9cd7af580d586316d368f74964cbc394f981.1630031207.git.vilhelm.gray@gmail.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/mfd/stm32-timers.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/stm32-timers.h b/include/linux/mfd/stm32-timers.h
index f8db83aedb2b..5f5c43fd69dd 100644
--- a/include/linux/mfd/stm32-timers.h
+++ b/include/linux/mfd/stm32-timers.h
@@ -82,6 +82,10 @@
 #define MAX_TIM_ICPSC		0x3
 #define TIM_CR2_MMS_SHIFT	4
 #define TIM_CR2_MMS2_SHIFT	20
+#define TIM_SMCR_SMS_SLAVE_MODE_DISABLED	0 /* counts on internal clock when CEN=1 */
+#define TIM_SMCR_SMS_ENCODER_MODE_1		1 /* counts TI1FP1 edges, depending on TI2FP2 level */
+#define TIM_SMCR_SMS_ENCODER_MODE_2		2 /* counts TI2FP2 edges, depending on TI1FP1 level */
+#define TIM_SMCR_SMS_ENCODER_MODE_3		3 /* counts on both TI1FP1 and TI2FP2 edges */
 #define TIM_SMCR_TS_SHIFT	4
 #define TIM_BDTR_BKF_MASK	0xF
 #define TIM_BDTR_BKF_SHIFT(x)	(16 + (x) * 4)
-- 
cgit v1.2.3


From aaec1a0f76ec25f46bbb17b81487c4b0e1c318c5 Mon Sep 17 00:00:00 2001
From: William Breathitt Gray <vilhelm.gray@gmail.com>
Date: Fri, 27 Aug 2021 12:47:47 +0900
Subject: counter: Internalize sysfs interface code

This is a reimplementation of the Generic Counter driver interface.
There are no modifications to the Counter subsystem userspace interface,
so existing userspace applications should continue to run seamlessly.

The purpose of this patch is to internalize the sysfs interface code
among the various counter drivers into a shared module. Counter drivers
pass and take data natively (i.e. u8, u64, etc.) and the shared counter
module handles the translation between the sysfs interface and the
device drivers. This guarantees a standard userspace interface for all
counter drivers, and helps generalize the Generic Counter driver ABI in
order to support the Generic Counter chrdev interface (introduced in a
subsequent patch) without significant changes to the existing counter
drivers.

Note, Counter device registration is the same as before: drivers
populate a struct counter_device with components and callbacks, then
pass the structure to the devm_counter_register function. However,
what's different now is how the Counter subsystem code handles this
registration internally.

Whereas before callbacks would interact directly with sysfs data, this
interaction is now abstracted and instead callbacks interact with native
C data types. The counter_comp structure forms the basis for Counter
extensions.

The counter-sysfs.c file contains the code to parse through the
counter_device structure and register the requested components and
extensions. Attributes are created and populated based on type, with
respective translation functions to handle the mapping between sysfs and
the counter driver callbacks.

The translation performed for each attribute is straightforward: the
attribute type and data is parsed from the counter_attribute structure,
the respective counter driver read/write callback is called, and sysfs
I/O is handled before or after the driver read/write function is called.

Cc: Jarkko Nikula <jarkko.nikula@linux.intel.com>
Cc: Patrick Havelange <patrick.havelange@essensium.com>
Cc: Kamel Bouhara <kamel.bouhara@bootlin.com>
Cc: Maxime Coquelin <mcoquelin.stm32@gmail.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Acked-by: Syed Nayyar Waris <syednwaris@gmail.com>
Reviewed-by: David Lechner <david@lechnology.com>
Tested-by: David Lechner <david@lechnology.com>
Signed-off-by: William Breathitt Gray <vilhelm.gray@gmail.com>
Reviewed-by: Fabrice Gasnier <fabrice.gasnier@foss.st.com> # for stm32
Link: https://lore.kernel.org/r/c68b4a1ffb195c1a2f65e8dd5ad7b7c14e79c6ef.1630031207.git.vilhelm.gray@gmail.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/counter.h      | 658 ++++++++++++++++++++++---------------------
 include/linux/counter_enum.h |  45 ---
 2 files changed, 338 insertions(+), 365 deletions(-)
 delete mode 100644 include/linux/counter_enum.h

(limited to 'include/linux')

diff --git a/include/linux/counter.h b/include/linux/counter.h
index d16ce2819b48..b69277f5c4c5 100644
--- a/include/linux/counter.h
+++ b/include/linux/counter.h
@@ -6,42 +6,184 @@
 #ifndef _COUNTER_H_
 #define _COUNTER_H_
 
-#include <linux/counter_enum.h>
 #include <linux/device.h>
+#include <linux/kernel.h>
 #include <linux/types.h>
 
+struct counter_device;
+struct counter_count;
+struct counter_synapse;
+struct counter_signal;
+
+enum counter_comp_type {
+	COUNTER_COMP_U8,
+	COUNTER_COMP_U64,
+	COUNTER_COMP_BOOL,
+	COUNTER_COMP_SIGNAL_LEVEL,
+	COUNTER_COMP_FUNCTION,
+	COUNTER_COMP_SYNAPSE_ACTION,
+	COUNTER_COMP_ENUM,
+	COUNTER_COMP_COUNT_DIRECTION,
+	COUNTER_COMP_COUNT_MODE,
+};
+
+enum counter_scope {
+	COUNTER_SCOPE_DEVICE,
+	COUNTER_SCOPE_SIGNAL,
+	COUNTER_SCOPE_COUNT,
+};
+
 enum counter_count_direction {
-	COUNTER_COUNT_DIRECTION_FORWARD = 0,
-	COUNTER_COUNT_DIRECTION_BACKWARD
+	COUNTER_COUNT_DIRECTION_FORWARD,
+	COUNTER_COUNT_DIRECTION_BACKWARD,
 };
-extern const char *const counter_count_direction_str[2];
 
 enum counter_count_mode {
-	COUNTER_COUNT_MODE_NORMAL = 0,
+	COUNTER_COUNT_MODE_NORMAL,
 	COUNTER_COUNT_MODE_RANGE_LIMIT,
 	COUNTER_COUNT_MODE_NON_RECYCLE,
-	COUNTER_COUNT_MODE_MODULO_N
+	COUNTER_COUNT_MODE_MODULO_N,
 };
-extern const char *const counter_count_mode_str[4];
 
-struct counter_device;
-struct counter_signal;
+enum counter_function {
+	COUNTER_FUNCTION_INCREASE,
+	COUNTER_FUNCTION_DECREASE,
+	COUNTER_FUNCTION_PULSE_DIRECTION,
+	COUNTER_FUNCTION_QUADRATURE_X1_A,
+	COUNTER_FUNCTION_QUADRATURE_X1_B,
+	COUNTER_FUNCTION_QUADRATURE_X2_A,
+	COUNTER_FUNCTION_QUADRATURE_X2_B,
+	COUNTER_FUNCTION_QUADRATURE_X4,
+};
+
+enum counter_signal_level {
+	COUNTER_SIGNAL_LEVEL_LOW,
+	COUNTER_SIGNAL_LEVEL_HIGH,
+};
+
+enum counter_synapse_action {
+	COUNTER_SYNAPSE_ACTION_NONE,
+	COUNTER_SYNAPSE_ACTION_RISING_EDGE,
+	COUNTER_SYNAPSE_ACTION_FALLING_EDGE,
+	COUNTER_SYNAPSE_ACTION_BOTH_EDGES,
+};
 
 /**
- * struct counter_signal_ext - Counter Signal extensions
- * @name:	attribute name
- * @read:	read callback for this attribute; may be NULL
- * @write:	write callback for this attribute; may be NULL
- * @priv:	data private to the driver
+ * struct counter_comp - Counter component node
+ * @type:		Counter component data type
+ * @name:		device-specific component name
+ * @priv:		component-relevant data
+ * @action_read		Synapse action mode read callback. The read value of the
+ *			respective Synapse action mode should be passed back via
+ *			the action parameter.
+ * @device_u8_read	Device u8 component read callback. The read value of the
+ *			respective Device u8 component should be passed back via
+ *			the val parameter.
+ * @count_u8_read	Count u8 component read callback. The read value of the
+ *			respective Count u8 component should be passed back via
+ *			the val parameter.
+ * @signal_u8_read	Signal u8 component read callback. The read value of the
+ *			respective Signal u8 component should be passed back via
+ *			the val parameter.
+ * @device_u32_read	Device u32 component read callback. The read value of
+ *			the respective Device u32 component should be passed
+ *			back via the val parameter.
+ * @count_u32_read	Count u32 component read callback. The read value of the
+ *			respective Count u32 component should be passed back via
+ *			the val parameter.
+ * @signal_u32_read	Signal u32 component read callback. The read value of
+ *			the respective Signal u32 component should be passed
+ *			back via the val parameter.
+ * @device_u64_read	Device u64 component read callback. The read value of
+ *			the respective Device u64 component should be passed
+ *			back via the val parameter.
+ * @count_u64_read	Count u64 component read callback. The read value of the
+ *			respective Count u64 component should be passed back via
+ *			the val parameter.
+ * @signal_u64_read	Signal u64 component read callback. The read value of
+ *			the respective Signal u64 component should be passed
+ *			back via the val parameter.
+ * @action_write	Synapse action mode write callback. The write value of
+ *			the respective Synapse action mode is passed via the
+ *			action parameter.
+ * @device_u8_write	Device u8 component write callback. The write value of
+ *			the respective Device u8 component is passed via the val
+ *			parameter.
+ * @count_u8_write	Count u8 component write callback. The write value of
+ *			the respective Count u8 component is passed via the val
+ *			parameter.
+ * @signal_u8_write	Signal u8 component write callback. The write value of
+ *			the respective Signal u8 component is passed via the val
+ *			parameter.
+ * @device_u32_write	Device u32 component write callback. The write value of
+ *			the respective Device u32 component is passed via the
+ *			val parameter.
+ * @count_u32_write	Count u32 component write callback. The write value of
+ *			the respective Count u32 component is passed via the val
+ *			parameter.
+ * @signal_u32_write	Signal u32 component write callback. The write value of
+ *			the respective Signal u32 component is passed via the
+ *			val parameter.
+ * @device_u64_write	Device u64 component write callback. The write value of
+ *			the respective Device u64 component is passed via the
+ *			val parameter.
+ * @count_u64_write	Count u64 component write callback. The write value of
+ *			the respective Count u64 component is passed via the val
+ *			parameter.
+ * @signal_u64_write	Signal u64 component write callback. The write value of
+ *			the respective Signal u64 component is passed via the
+ *			val parameter.
  */
-struct counter_signal_ext {
+struct counter_comp {
+	enum counter_comp_type type;
 	const char *name;
-	ssize_t (*read)(struct counter_device *counter,
-			struct counter_signal *signal, void *priv, char *buf);
-	ssize_t (*write)(struct counter_device *counter,
-			 struct counter_signal *signal, void *priv,
-			 const char *buf, size_t len);
 	void *priv;
+	union {
+		int (*action_read)(struct counter_device *counter,
+				   struct counter_count *count,
+				   struct counter_synapse *synapse,
+				   enum counter_synapse_action *action);
+		int (*device_u8_read)(struct counter_device *counter, u8 *val);
+		int (*count_u8_read)(struct counter_device *counter,
+				     struct counter_count *count, u8 *val);
+		int (*signal_u8_read)(struct counter_device *counter,
+				      struct counter_signal *signal, u8 *val);
+		int (*device_u32_read)(struct counter_device *counter,
+				       u32 *val);
+		int (*count_u32_read)(struct counter_device *counter,
+				      struct counter_count *count, u32 *val);
+		int (*signal_u32_read)(struct counter_device *counter,
+				       struct counter_signal *signal, u32 *val);
+		int (*device_u64_read)(struct counter_device *counter,
+				       u64 *val);
+		int (*count_u64_read)(struct counter_device *counter,
+				      struct counter_count *count, u64 *val);
+		int (*signal_u64_read)(struct counter_device *counter,
+				       struct counter_signal *signal, u64 *val);
+	};
+	union {
+		int (*action_write)(struct counter_device *counter,
+				    struct counter_count *count,
+				    struct counter_synapse *synapse,
+				    enum counter_synapse_action action);
+		int (*device_u8_write)(struct counter_device *counter, u8 val);
+		int (*count_u8_write)(struct counter_device *counter,
+				      struct counter_count *count, u8 val);
+		int (*signal_u8_write)(struct counter_device *counter,
+				       struct counter_signal *signal, u8 val);
+		int (*device_u32_write)(struct counter_device *counter,
+					u32 val);
+		int (*count_u32_write)(struct counter_device *counter,
+				       struct counter_count *count, u32 val);
+		int (*signal_u32_write)(struct counter_device *counter,
+					struct counter_signal *signal, u32 val);
+		int (*device_u64_write)(struct counter_device *counter,
+					u64 val);
+		int (*count_u64_write)(struct counter_device *counter,
+				       struct counter_count *count, u64 val);
+		int (*signal_u64_write)(struct counter_device *counter,
+					struct counter_signal *signal, u64 val);
+	};
 };
 
 /**
@@ -51,248 +193,52 @@ struct counter_signal_ext {
  *		as it appears in the datasheet documentation
  * @ext:	optional array of Counter Signal extensions
  * @num_ext:	number of Counter Signal extensions specified in @ext
- * @priv:	optional private data supplied by driver
  */
 struct counter_signal {
 	int id;
 	const char *name;
 
-	const struct counter_signal_ext *ext;
+	struct counter_comp *ext;
 	size_t num_ext;
-
-	void *priv;
-};
-
-/**
- * struct counter_signal_enum_ext - Signal enum extension attribute
- * @items:	Array of strings
- * @num_items:	Number of items specified in @items
- * @set:	Set callback function; may be NULL
- * @get:	Get callback function; may be NULL
- *
- * The counter_signal_enum_ext structure can be used to implement enum style
- * Signal extension attributes. Enum style attributes are those which have a set
- * of strings that map to unsigned integer values. The Generic Counter Signal
- * enum extension helper code takes care of mapping between value and string, as
- * well as generating a "_available" file which contains a list of all available
- * items. The get callback is used to query the currently active item; the index
- * of the item within the respective items array is returned via the 'item'
- * parameter. The set callback is called when the attribute is updated; the
- * 'item' parameter contains the index of the newly activated item within the
- * respective items array.
- */
-struct counter_signal_enum_ext {
-	const char * const *items;
-	size_t num_items;
-	int (*get)(struct counter_device *counter,
-		   struct counter_signal *signal, size_t *item);
-	int (*set)(struct counter_device *counter,
-		   struct counter_signal *signal, size_t item);
-};
-
-/**
- * COUNTER_SIGNAL_ENUM() - Initialize Signal enum extension
- * @_name:	Attribute name
- * @_e:		Pointer to a counter_signal_enum_ext structure
- *
- * This should usually be used together with COUNTER_SIGNAL_ENUM_AVAILABLE()
- */
-#define COUNTER_SIGNAL_ENUM(_name, _e) \
-{ \
-	.name = (_name), \
-	.read = counter_signal_enum_read, \
-	.write = counter_signal_enum_write, \
-	.priv = (_e) \
-}
-
-/**
- * COUNTER_SIGNAL_ENUM_AVAILABLE() - Initialize Signal enum available extension
- * @_name:	Attribute name ("_available" will be appended to the name)
- * @_e:		Pointer to a counter_signal_enum_ext structure
- *
- * Creates a read only attribute that lists all the available enum items in a
- * newline separated list. This should usually be used together with
- * COUNTER_SIGNAL_ENUM()
- */
-#define COUNTER_SIGNAL_ENUM_AVAILABLE(_name, _e) \
-{ \
-	.name = (_name "_available"), \
-	.read = counter_signal_enum_available_read, \
-	.priv = (_e) \
-}
-
-enum counter_synapse_action {
-	COUNTER_SYNAPSE_ACTION_NONE = 0,
-	COUNTER_SYNAPSE_ACTION_RISING_EDGE,
-	COUNTER_SYNAPSE_ACTION_FALLING_EDGE,
-	COUNTER_SYNAPSE_ACTION_BOTH_EDGES
 };
 
 /**
  * struct counter_synapse - Counter Synapse node
- * @action:		index of current action mode
  * @actions_list:	array of available action modes
  * @num_actions:	number of action modes specified in @actions_list
  * @signal:		pointer to associated signal
  */
 struct counter_synapse {
-	size_t action;
 	const enum counter_synapse_action *actions_list;
 	size_t num_actions;
 
 	struct counter_signal *signal;
 };
 
-struct counter_count;
-
-/**
- * struct counter_count_ext - Counter Count extension
- * @name:	attribute name
- * @read:	read callback for this attribute; may be NULL
- * @write:	write callback for this attribute; may be NULL
- * @priv:	data private to the driver
- */
-struct counter_count_ext {
-	const char *name;
-	ssize_t (*read)(struct counter_device *counter,
-			struct counter_count *count, void *priv, char *buf);
-	ssize_t (*write)(struct counter_device *counter,
-			 struct counter_count *count, void *priv,
-			 const char *buf, size_t len);
-	void *priv;
-};
-
-enum counter_function {
-	COUNTER_FUNCTION_INCREASE = 0,
-	COUNTER_FUNCTION_DECREASE,
-	COUNTER_FUNCTION_PULSE_DIRECTION,
-	COUNTER_FUNCTION_QUADRATURE_X1_A,
-	COUNTER_FUNCTION_QUADRATURE_X1_B,
-	COUNTER_FUNCTION_QUADRATURE_X2_A,
-	COUNTER_FUNCTION_QUADRATURE_X2_B,
-	COUNTER_FUNCTION_QUADRATURE_X4
-};
-
 /**
  * struct counter_count - Counter Count node
  * @id:			unique ID used to identify Count
  * @name:		device-specific Count name; ideally, this should match
  *			the name as it appears in the datasheet documentation
- * @function:		index of current function mode
  * @functions_list:	array available function modes
  * @num_functions:	number of function modes specified in @functions_list
  * @synapses:		array of synapses for initialization
  * @num_synapses:	number of synapses specified in @synapses
  * @ext:		optional array of Counter Count extensions
  * @num_ext:		number of Counter Count extensions specified in @ext
- * @priv:		optional private data supplied by driver
  */
 struct counter_count {
 	int id;
 	const char *name;
 
-	size_t function;
 	const enum counter_function *functions_list;
 	size_t num_functions;
 
 	struct counter_synapse *synapses;
 	size_t num_synapses;
 
-	const struct counter_count_ext *ext;
+	struct counter_comp *ext;
 	size_t num_ext;
-
-	void *priv;
-};
-
-/**
- * struct counter_count_enum_ext - Count enum extension attribute
- * @items:	Array of strings
- * @num_items:	Number of items specified in @items
- * @set:	Set callback function; may be NULL
- * @get:	Get callback function; may be NULL
- *
- * The counter_count_enum_ext structure can be used to implement enum style
- * Count extension attributes. Enum style attributes are those which have a set
- * of strings that map to unsigned integer values. The Generic Counter Count
- * enum extension helper code takes care of mapping between value and string, as
- * well as generating a "_available" file which contains a list of all available
- * items. The get callback is used to query the currently active item; the index
- * of the item within the respective items array is returned via the 'item'
- * parameter. The set callback is called when the attribute is updated; the
- * 'item' parameter contains the index of the newly activated item within the
- * respective items array.
- */
-struct counter_count_enum_ext {
-	const char * const *items;
-	size_t num_items;
-	int (*get)(struct counter_device *counter, struct counter_count *count,
-		   size_t *item);
-	int (*set)(struct counter_device *counter, struct counter_count *count,
-		   size_t item);
-};
-
-/**
- * COUNTER_COUNT_ENUM() - Initialize Count enum extension
- * @_name:	Attribute name
- * @_e:		Pointer to a counter_count_enum_ext structure
- *
- * This should usually be used together with COUNTER_COUNT_ENUM_AVAILABLE()
- */
-#define COUNTER_COUNT_ENUM(_name, _e) \
-{ \
-	.name = (_name), \
-	.read = counter_count_enum_read, \
-	.write = counter_count_enum_write, \
-	.priv = (_e) \
-}
-
-/**
- * COUNTER_COUNT_ENUM_AVAILABLE() - Initialize Count enum available extension
- * @_name:	Attribute name ("_available" will be appended to the name)
- * @_e:		Pointer to a counter_count_enum_ext structure
- *
- * Creates a read only attribute that lists all the available enum items in a
- * newline separated list. This should usually be used together with
- * COUNTER_COUNT_ENUM()
- */
-#define COUNTER_COUNT_ENUM_AVAILABLE(_name, _e) \
-{ \
-	.name = (_name "_available"), \
-	.read = counter_count_enum_available_read, \
-	.priv = (_e) \
-}
-
-/**
- * struct counter_device_attr_group - internal container for attribute group
- * @attr_group:	Counter sysfs attributes group
- * @attr_list:	list to keep track of created Counter sysfs attributes
- * @num_attr:	number of Counter sysfs attributes
- */
-struct counter_device_attr_group {
-	struct attribute_group attr_group;
-	struct list_head attr_list;
-	size_t num_attr;
-};
-
-/**
- * struct counter_device_state - internal state container for a Counter device
- * @id:			unique ID used to identify the Counter
- * @dev:		internal device structure
- * @groups_list:	attribute groups list (for Signals, Counts, and ext)
- * @num_groups:		number of attribute groups containers
- * @groups:		Counter sysfs attribute groups (to populate @dev.groups)
- */
-struct counter_device_state {
-	int id;
-	struct device dev;
-	struct counter_device_attr_group *groups_list;
-	size_t num_groups;
-	const struct attribute_group **groups;
-};
-
-enum counter_signal_level {
-	COUNTER_SIGNAL_LEVEL_LOW,
-	COUNTER_SIGNAL_LEVEL_HIGH,
 };
 
 /**
@@ -306,117 +252,47 @@ enum counter_signal_level {
  * @count_write:	optional write callback for Count attribute. The write
  *			value for the respective Count is passed in via the val
  *			parameter.
- * @function_get:	function to get the current count function mode. Returns
- *			0 on success and negative error code on error. The index
- *			of the respective Count's returned function mode should
- *			be passed back via the function parameter.
- * @function_set:	function to set the count function mode. function is the
- *			index of the requested function mode from the respective
- *			Count's functions_list array.
- * @action_get:		function to get the current action mode. Returns 0 on
- *			success and negative error code on error. The index of
- *			the respective Synapse's returned action mode should be
- *			passed back via the action parameter.
- * @action_set:		function to set the action mode. action is the index of
- *			the requested action mode from the respective Synapse's
- *			actions_list array.
+ * @function_read:	read callback the Count function modes. The read
+ *			function mode of the respective Count should be passed
+ *			back via the function parameter.
+ * @function_write:	write callback for Count function modes. The function
+ *			mode to write for the respective Count is passed in via
+ *			the function parameter.
+ * @action_read:	read callback the Synapse action modes. The read action
+ *			mode of the respective Synapse should be passed back via
+ *			the action parameter.
+ * @action_write:	write callback for Synapse action modes. The action mode
+ *			to write for the respective Synapse is passed in via the
+ *			action parameter.
  */
 struct counter_ops {
 	int (*signal_read)(struct counter_device *counter,
 			   struct counter_signal *signal,
 			   enum counter_signal_level *level);
 	int (*count_read)(struct counter_device *counter,
-			  struct counter_count *count, unsigned long *val);
+			  struct counter_count *count, u64 *value);
 	int (*count_write)(struct counter_device *counter,
-			   struct counter_count *count, unsigned long val);
-	int (*function_get)(struct counter_device *counter,
-			    struct counter_count *count, size_t *function);
-	int (*function_set)(struct counter_device *counter,
-			    struct counter_count *count, size_t function);
-	int (*action_get)(struct counter_device *counter,
-			  struct counter_count *count,
-			  struct counter_synapse *synapse, size_t *action);
-	int (*action_set)(struct counter_device *counter,
-			  struct counter_count *count,
-			  struct counter_synapse *synapse, size_t action);
-};
-
-/**
- * struct counter_device_ext - Counter device extension
- * @name:	attribute name
- * @read:	read callback for this attribute; may be NULL
- * @write:	write callback for this attribute; may be NULL
- * @priv:	data private to the driver
- */
-struct counter_device_ext {
-	const char *name;
-	ssize_t (*read)(struct counter_device *counter, void *priv, char *buf);
-	ssize_t (*write)(struct counter_device *counter, void *priv,
-			 const char *buf, size_t len);
-	void *priv;
+			   struct counter_count *count, u64 value);
+	int (*function_read)(struct counter_device *counter,
+			     struct counter_count *count,
+			     enum counter_function *function);
+	int (*function_write)(struct counter_device *counter,
+			      struct counter_count *count,
+			      enum counter_function function);
+	int (*action_read)(struct counter_device *counter,
+			   struct counter_count *count,
+			   struct counter_synapse *synapse,
+			   enum counter_synapse_action *action);
+	int (*action_write)(struct counter_device *counter,
+			    struct counter_count *count,
+			    struct counter_synapse *synapse,
+			    enum counter_synapse_action action);
 };
 
-/**
- * struct counter_device_enum_ext - Counter enum extension attribute
- * @items:	Array of strings
- * @num_items:	Number of items specified in @items
- * @set:	Set callback function; may be NULL
- * @get:	Get callback function; may be NULL
- *
- * The counter_device_enum_ext structure can be used to implement enum style
- * Counter extension attributes. Enum style attributes are those which have a
- * set of strings that map to unsigned integer values. The Generic Counter enum
- * extension helper code takes care of mapping between value and string, as well
- * as generating a "_available" file which contains a list of all available
- * items. The get callback is used to query the currently active item; the index
- * of the item within the respective items array is returned via the 'item'
- * parameter. The set callback is called when the attribute is updated; the
- * 'item' parameter contains the index of the newly activated item within the
- * respective items array.
- */
-struct counter_device_enum_ext {
-	const char * const *items;
-	size_t num_items;
-	int (*get)(struct counter_device *counter, size_t *item);
-	int (*set)(struct counter_device *counter, size_t item);
-};
-
-/**
- * COUNTER_DEVICE_ENUM() - Initialize Counter enum extension
- * @_name:	Attribute name
- * @_e:		Pointer to a counter_device_enum_ext structure
- *
- * This should usually be used together with COUNTER_DEVICE_ENUM_AVAILABLE()
- */
-#define COUNTER_DEVICE_ENUM(_name, _e) \
-{ \
-	.name = (_name), \
-	.read = counter_device_enum_read, \
-	.write = counter_device_enum_write, \
-	.priv = (_e) \
-}
-
-/**
- * COUNTER_DEVICE_ENUM_AVAILABLE() - Initialize Counter enum available extension
- * @_name:	Attribute name ("_available" will be appended to the name)
- * @_e:		Pointer to a counter_device_enum_ext structure
- *
- * Creates a read only attribute that lists all the available enum items in a
- * newline separated list. This should usually be used together with
- * COUNTER_DEVICE_ENUM()
- */
-#define COUNTER_DEVICE_ENUM_AVAILABLE(_name, _e) \
-{ \
-	.name = (_name "_available"), \
-	.read = counter_device_enum_available_read, \
-	.priv = (_e) \
-}
-
 /**
  * struct counter_device - Counter data structure
  * @name:		name of the device as it appears in the datasheet
  * @parent:		optional parent device providing the counters
- * @device_state:	internal device state container
  * @ops:		callbacks from driver
  * @signals:		array of Signals
  * @num_signals:	number of Signals specified in @signals
@@ -425,11 +301,11 @@ struct counter_device_enum_ext {
  * @ext:		optional array of Counter device extensions
  * @num_ext:		number of Counter device extensions specified in @ext
  * @priv:		optional private data supplied by driver
+ * @dev:		internal device structure
  */
 struct counter_device {
 	const char *name;
 	struct device *parent;
-	struct counter_device_state *device_state;
 
 	const struct counter_ops *ops;
 
@@ -438,17 +314,159 @@ struct counter_device {
 	struct counter_count *counts;
 	size_t num_counts;
 
-	const struct counter_device_ext *ext;
+	struct counter_comp *ext;
 	size_t num_ext;
 
 	void *priv;
+
+	struct device dev;
 };
 
 int counter_register(struct counter_device *const counter);
 void counter_unregister(struct counter_device *const counter);
 int devm_counter_register(struct device *dev,
 			  struct counter_device *const counter);
-void devm_counter_unregister(struct device *dev,
-			     struct counter_device *const counter);
+
+#define COUNTER_COMP_DEVICE_U8(_name, _read, _write) \
+{ \
+	.type = COUNTER_COMP_U8, \
+	.name = (_name), \
+	.device_u8_read = (_read), \
+	.device_u8_write = (_write), \
+}
+#define COUNTER_COMP_COUNT_U8(_name, _read, _write) \
+{ \
+	.type = COUNTER_COMP_U8, \
+	.name = (_name), \
+	.count_u8_read = (_read), \
+	.count_u8_write = (_write), \
+}
+#define COUNTER_COMP_SIGNAL_U8(_name, _read, _write) \
+{ \
+	.type = COUNTER_COMP_U8, \
+	.name = (_name), \
+	.signal_u8_read = (_read), \
+	.signal_u8_write = (_write), \
+}
+
+#define COUNTER_COMP_DEVICE_U64(_name, _read, _write) \
+{ \
+	.type = COUNTER_COMP_U64, \
+	.name = (_name), \
+	.device_u64_read = (_read), \
+	.device_u64_write = (_write), \
+}
+#define COUNTER_COMP_COUNT_U64(_name, _read, _write) \
+{ \
+	.type = COUNTER_COMP_U64, \
+	.name = (_name), \
+	.count_u64_read = (_read), \
+	.count_u64_write = (_write), \
+}
+#define COUNTER_COMP_SIGNAL_U64(_name, _read, _write) \
+{ \
+	.type = COUNTER_COMP_U64, \
+	.name = (_name), \
+	.signal_u64_read = (_read), \
+	.signal_u64_write = (_write), \
+}
+
+#define COUNTER_COMP_DEVICE_BOOL(_name, _read, _write) \
+{ \
+	.type = COUNTER_COMP_BOOL, \
+	.name = (_name), \
+	.device_u8_read = (_read), \
+	.device_u8_write = (_write), \
+}
+#define COUNTER_COMP_COUNT_BOOL(_name, _read, _write) \
+{ \
+	.type = COUNTER_COMP_BOOL, \
+	.name = (_name), \
+	.count_u8_read = (_read), \
+	.count_u8_write = (_write), \
+}
+#define COUNTER_COMP_SIGNAL_BOOL(_name, _read, _write) \
+{ \
+	.type = COUNTER_COMP_BOOL, \
+	.name = (_name), \
+	.signal_u8_read = (_read), \
+	.signal_u8_write = (_write), \
+}
+
+struct counter_available {
+	union {
+		const u32 *enums;
+		const char *const *strs;
+	};
+	size_t num_items;
+};
+
+#define DEFINE_COUNTER_AVAILABLE(_name, _enums) \
+	struct counter_available _name = { \
+		.enums = (_enums), \
+		.num_items = ARRAY_SIZE(_enums), \
+	}
+
+#define DEFINE_COUNTER_ENUM(_name, _strs) \
+	struct counter_available _name = { \
+		.strs = (_strs), \
+		.num_items = ARRAY_SIZE(_strs), \
+	}
+
+#define COUNTER_COMP_DEVICE_ENUM(_name, _get, _set, _available) \
+{ \
+	.type = COUNTER_COMP_ENUM, \
+	.name = (_name), \
+	.device_u32_read = (_get), \
+	.device_u32_write = (_set), \
+	.priv = &(_available), \
+}
+#define COUNTER_COMP_COUNT_ENUM(_name, _get, _set, _available) \
+{ \
+	.type = COUNTER_COMP_ENUM, \
+	.name = (_name), \
+	.count_u32_read = (_get), \
+	.count_u32_write = (_set), \
+	.priv = &(_available), \
+}
+#define COUNTER_COMP_SIGNAL_ENUM(_name, _get, _set, _available) \
+{ \
+	.type = COUNTER_COMP_ENUM, \
+	.name = (_name), \
+	.signal_u32_read = (_get), \
+	.signal_u32_write = (_set), \
+	.priv = &(_available), \
+}
+
+#define COUNTER_COMP_CEILING(_read, _write) \
+	COUNTER_COMP_COUNT_U64("ceiling", _read, _write)
+
+#define COUNTER_COMP_COUNT_MODE(_read, _write, _available) \
+{ \
+	.type = COUNTER_COMP_COUNT_MODE, \
+	.name = "count_mode", \
+	.count_u32_read = (_read), \
+	.count_u32_write = (_write), \
+	.priv = &(_available), \
+}
+
+#define COUNTER_COMP_DIRECTION(_read) \
+{ \
+	.type = COUNTER_COMP_COUNT_DIRECTION, \
+	.name = "direction", \
+	.count_u32_read = (_read), \
+}
+
+#define COUNTER_COMP_ENABLE(_read, _write) \
+	COUNTER_COMP_COUNT_BOOL("enable", _read, _write)
+
+#define COUNTER_COMP_FLOOR(_read, _write) \
+	COUNTER_COMP_COUNT_U64("floor", _read, _write)
+
+#define COUNTER_COMP_PRESET(_read, _write) \
+	COUNTER_COMP_COUNT_U64("preset", _read, _write)
+
+#define COUNTER_COMP_PRESET_ENABLE(_read, _write) \
+	COUNTER_COMP_COUNT_BOOL("preset_enable", _read, _write)
 
 #endif /* _COUNTER_H_ */
diff --git a/include/linux/counter_enum.h b/include/linux/counter_enum.h
deleted file mode 100644
index 9f917298a88f..000000000000
--- a/include/linux/counter_enum.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Counter interface enum functions
- * Copyright (C) 2018 William Breathitt Gray
- */
-#ifndef _COUNTER_ENUM_H_
-#define _COUNTER_ENUM_H_
-
-#include <linux/types.h>
-
-struct counter_device;
-struct counter_signal;
-struct counter_count;
-
-ssize_t counter_signal_enum_read(struct counter_device *counter,
-				 struct counter_signal *signal, void *priv,
-				 char *buf);
-ssize_t counter_signal_enum_write(struct counter_device *counter,
-				  struct counter_signal *signal, void *priv,
-				  const char *buf, size_t len);
-
-ssize_t counter_signal_enum_available_read(struct counter_device *counter,
-					   struct counter_signal *signal,
-					   void *priv, char *buf);
-
-ssize_t counter_count_enum_read(struct counter_device *counter,
-				struct counter_count *count, void *priv,
-				char *buf);
-ssize_t counter_count_enum_write(struct counter_device *counter,
-				 struct counter_count *count, void *priv,
-				 const char *buf, size_t len);
-
-ssize_t counter_count_enum_available_read(struct counter_device *counter,
-					  struct counter_count *count,
-					  void *priv, char *buf);
-
-ssize_t counter_device_enum_read(struct counter_device *counter, void *priv,
-				 char *buf);
-ssize_t counter_device_enum_write(struct counter_device *counter, void *priv,
-				  const char *buf, size_t len);
-
-ssize_t counter_device_enum_available_read(struct counter_device *counter,
-					   void *priv, char *buf);
-
-#endif /* _COUNTER_ENUM_H_ */
-- 
cgit v1.2.3


From 712392f558ef4280b67659fb2d52854dcfc7c8ba Mon Sep 17 00:00:00 2001
From: William Breathitt Gray <vilhelm.gray@gmail.com>
Date: Fri, 27 Aug 2021 12:47:48 +0900
Subject: counter: Update counter.h comments to reflect sysfs internalization

The Counter subsystem architecture and driver implementations have
changed in order to handle Counter sysfs interactions in a more
consistent way. This patch updates the Generic Counter interface
header file comments to reflect the changes.

Signed-off-by: William Breathitt Gray <vilhelm.gray@gmail.com>
Link: https://lore.kernel.org/r/19da8ae0c05381b0967c8a334b67f86b814eb880.1630031207.git.vilhelm.gray@gmail.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/counter.h | 62 ++++++++++++++++++++++++-------------------------
 1 file changed, 30 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/counter.h b/include/linux/counter.h
index b69277f5c4c5..445f22d8bfe2 100644
--- a/include/linux/counter.h
+++ b/include/linux/counter.h
@@ -188,11 +188,10 @@ struct counter_comp {
 
 /**
  * struct counter_signal - Counter Signal node
- * @id:		unique ID used to identify signal
- * @name:	device-specific Signal name; ideally, this should match the name
- *		as it appears in the datasheet documentation
- * @ext:	optional array of Counter Signal extensions
- * @num_ext:	number of Counter Signal extensions specified in @ext
+ * @id:		unique ID used to identify the Signal
+ * @name:	device-specific Signal name
+ * @ext:	optional array of Signal extensions
+ * @num_ext:	number of Signal extensions specified in @ext
  */
 struct counter_signal {
 	int id;
@@ -206,7 +205,7 @@ struct counter_signal {
  * struct counter_synapse - Counter Synapse node
  * @actions_list:	array of available action modes
  * @num_actions:	number of action modes specified in @actions_list
- * @signal:		pointer to associated signal
+ * @signal:		pointer to the associated Signal
  */
 struct counter_synapse {
 	const enum counter_synapse_action *actions_list;
@@ -217,15 +216,14 @@ struct counter_synapse {
 
 /**
  * struct counter_count - Counter Count node
- * @id:			unique ID used to identify Count
- * @name:		device-specific Count name; ideally, this should match
- *			the name as it appears in the datasheet documentation
- * @functions_list:	array available function modes
+ * @id:			unique ID used to identify the Count
+ * @name:		device-specific Count name
+ * @functions_list:	array of available function modes
  * @num_functions:	number of function modes specified in @functions_list
- * @synapses:		array of synapses for initialization
- * @num_synapses:	number of synapses specified in @synapses
- * @ext:		optional array of Counter Count extensions
- * @num_ext:		number of Counter Count extensions specified in @ext
+ * @synapses:		array of Synapses for initialization
+ * @num_synapses:	number of Synapses specified in @synapses
+ * @ext:		optional array of Count extensions
+ * @num_ext:		number of Count extensions specified in @ext
  */
 struct counter_count {
 	int id;
@@ -243,27 +241,27 @@ struct counter_count {
 
 /**
  * struct counter_ops - Callbacks from driver
- * @signal_read:	optional read callback for Signal attribute. The read
- *			level of the respective Signal should be passed back via
- *			the level parameter.
- * @count_read:		optional read callback for Count attribute. The read
- *			value of the respective Count should be passed back via
- *			the val parameter.
- * @count_write:	optional write callback for Count attribute. The write
- *			value for the respective Count is passed in via the val
+ * @signal_read:	optional read callback for Signals. The read level of
+ *			the respective Signal should be passed back via the
+ *			level parameter.
+ * @count_read:		read callback for Counts. The read value of the
+ *			respective Count should be passed back via the value
+ *			parameter.
+ * @count_write:	optional write callback for Counts. The write value for
+ *			the respective Count is passed in via the value
  *			parameter.
  * @function_read:	read callback the Count function modes. The read
  *			function mode of the respective Count should be passed
  *			back via the function parameter.
- * @function_write:	write callback for Count function modes. The function
- *			mode to write for the respective Count is passed in via
- *			the function parameter.
- * @action_read:	read callback the Synapse action modes. The read action
- *			mode of the respective Synapse should be passed back via
- *			the action parameter.
- * @action_write:	write callback for Synapse action modes. The action mode
- *			to write for the respective Synapse is passed in via the
- *			action parameter.
+ * @function_write:	optional write callback for Count function modes. The
+ *			function mode to write for the respective Count is
+ *			passed in via the function parameter.
+ * @action_read:	optional read callback the Synapse action modes. The
+ *			read action mode of the respective Synapse should be
+ *			passed back via the action parameter.
+ * @action_write:	optional write callback for Synapse action modes. The
+ *			action mode to write for the respective Synapse is
+ *			passed in via the action parameter.
  */
 struct counter_ops {
 	int (*signal_read)(struct counter_device *counter,
@@ -291,7 +289,7 @@ struct counter_ops {
 
 /**
  * struct counter_device - Counter data structure
- * @name:		name of the device as it appears in the datasheet
+ * @name:		name of the device
  * @parent:		optional parent device providing the counters
  * @ops:		callbacks from driver
  * @signals:		array of Signals
-- 
cgit v1.2.3


From e65c26f413718ed2e6d788491adcd8cebc0f44b6 Mon Sep 17 00:00:00 2001
From: William Breathitt Gray <vilhelm.gray@gmail.com>
Date: Wed, 29 Sep 2021 12:15:58 +0900
Subject: counter: Move counter enums to uapi header

This is in preparation for a subsequent patch implementing a character
device interface for the Counter subsystem.

Reviewed-by: David Lechner <david@lechnology.com>
Signed-off-by: William Breathitt Gray <vilhelm.gray@gmail.com>
Link: https://lore.kernel.org/r/962a5f2027fafcf4f77c10e1baf520463960d1ee.1632884256.git.vilhelm.gray@gmail.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/counter.h | 42 +-----------------------------------------
 1 file changed, 1 insertion(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/counter.h b/include/linux/counter.h
index 445f22d8bfe2..7c9f7e23953a 100644
--- a/include/linux/counter.h
+++ b/include/linux/counter.h
@@ -9,6 +9,7 @@
 #include <linux/device.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
+#include <uapi/linux/counter.h>
 
 struct counter_device;
 struct counter_count;
@@ -27,47 +28,6 @@ enum counter_comp_type {
 	COUNTER_COMP_COUNT_MODE,
 };
 
-enum counter_scope {
-	COUNTER_SCOPE_DEVICE,
-	COUNTER_SCOPE_SIGNAL,
-	COUNTER_SCOPE_COUNT,
-};
-
-enum counter_count_direction {
-	COUNTER_COUNT_DIRECTION_FORWARD,
-	COUNTER_COUNT_DIRECTION_BACKWARD,
-};
-
-enum counter_count_mode {
-	COUNTER_COUNT_MODE_NORMAL,
-	COUNTER_COUNT_MODE_RANGE_LIMIT,
-	COUNTER_COUNT_MODE_NON_RECYCLE,
-	COUNTER_COUNT_MODE_MODULO_N,
-};
-
-enum counter_function {
-	COUNTER_FUNCTION_INCREASE,
-	COUNTER_FUNCTION_DECREASE,
-	COUNTER_FUNCTION_PULSE_DIRECTION,
-	COUNTER_FUNCTION_QUADRATURE_X1_A,
-	COUNTER_FUNCTION_QUADRATURE_X1_B,
-	COUNTER_FUNCTION_QUADRATURE_X2_A,
-	COUNTER_FUNCTION_QUADRATURE_X2_B,
-	COUNTER_FUNCTION_QUADRATURE_X4,
-};
-
-enum counter_signal_level {
-	COUNTER_SIGNAL_LEVEL_LOW,
-	COUNTER_SIGNAL_LEVEL_HIGH,
-};
-
-enum counter_synapse_action {
-	COUNTER_SYNAPSE_ACTION_NONE,
-	COUNTER_SYNAPSE_ACTION_RISING_EDGE,
-	COUNTER_SYNAPSE_ACTION_FALLING_EDGE,
-	COUNTER_SYNAPSE_ACTION_BOTH_EDGES,
-};
-
 /**
  * struct counter_comp - Counter component node
  * @type:		Counter component data type
-- 
cgit v1.2.3


From b6c50affda5957a3629b149a91c7f6688ffce7f7 Mon Sep 17 00:00:00 2001
From: William Breathitt Gray <vilhelm.gray@gmail.com>
Date: Wed, 29 Sep 2021 12:15:59 +0900
Subject: counter: Add character device interface

This patch introduces a character device interface for the Counter
subsystem. Device data is exposed through standard character device read
operations. Device data is gathered when a Counter event is pushed by
the respective Counter device driver. Configuration is handled via ioctl
operations on the respective Counter character device node.

Cc: David Lechner <david@lechnology.com>
Cc: Gwendal Grignou <gwendal@chromium.org>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Oleksij Rempel <o.rempel@pengutronix.de>
Signed-off-by: William Breathitt Gray <vilhelm.gray@gmail.com>
Link: https://lore.kernel.org/r/b8b8c64b4065aedff43699ad1f0e2f8d1419c15b.1632884256.git.vilhelm.gray@gmail.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/counter.h | 56 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/counter.h b/include/linux/counter.h
index 7c9f7e23953a..22b14a552b1d 100644
--- a/include/linux/counter.h
+++ b/include/linux/counter.h
@@ -6,9 +6,14 @@
 #ifndef _COUNTER_H_
 #define _COUNTER_H_
 
+#include <linux/cdev.h>
 #include <linux/device.h>
 #include <linux/kernel.h>
+#include <linux/kfifo.h>
+#include <linux/mutex.h>
+#include <linux/spinlock_types.h>
 #include <linux/types.h>
+#include <linux/wait.h>
 #include <uapi/linux/counter.h>
 
 struct counter_device;
@@ -199,6 +204,20 @@ struct counter_count {
 	size_t num_ext;
 };
 
+/**
+ * struct counter_event_node - Counter Event node
+ * @l:		list of current watching Counter events
+ * @event:	event that triggers
+ * @channel:	event channel
+ * @comp_list:	list of components to watch when event triggers
+ */
+struct counter_event_node {
+	struct list_head l;
+	u8 event;
+	u8 channel;
+	struct list_head comp_list;
+};
+
 /**
  * struct counter_ops - Callbacks from driver
  * @signal_read:	optional read callback for Signals. The read level of
@@ -222,6 +241,13 @@ struct counter_count {
  * @action_write:	optional write callback for Synapse action modes. The
  *			action mode to write for the respective Synapse is
  *			passed in via the action parameter.
+ * @events_configure:	optional write callback to configure events. The list of
+ *			struct counter_event_node may be accessed via the
+ *			events_list member of the counter parameter.
+ * @watch_validate:	optional callback to validate a watch. The Counter
+ *			component watch configuration is passed in via the watch
+ *			parameter. A return value of 0 indicates a valid Counter
+ *			component watch configuration.
  */
 struct counter_ops {
 	int (*signal_read)(struct counter_device *counter,
@@ -245,6 +271,9 @@ struct counter_ops {
 			    struct counter_count *count,
 			    struct counter_synapse *synapse,
 			    enum counter_synapse_action action);
+	int (*events_configure)(struct counter_device *counter);
+	int (*watch_validate)(struct counter_device *counter,
+			      const struct counter_watch *watch);
 };
 
 /**
@@ -260,6 +289,16 @@ struct counter_ops {
  * @num_ext:		number of Counter device extensions specified in @ext
  * @priv:		optional private data supplied by driver
  * @dev:		internal device structure
+ * @chrdev:		internal character device structure
+ * @events_list:	list of current watching Counter events
+ * @events_list_lock:	lock to protect Counter events list operations
+ * @next_events_list:	list of next watching Counter events
+ * @n_events_list_lock:	lock to protect Counter next events list operations
+ * @events:		queue of detected Counter events
+ * @events_wait:	wait queue to allow blocking reads of Counter events
+ * @events_lock:	lock to protect Counter events queue read operations
+ * @chrdev_lock:	lock to limit chrdev to a single open at a time
+ * @ops_exist_lock:	lock to prevent use during removal
  */
 struct counter_device {
 	const char *name;
@@ -278,12 +317,29 @@ struct counter_device {
 	void *priv;
 
 	struct device dev;
+	struct cdev chrdev;
+	struct list_head events_list;
+	spinlock_t events_list_lock;
+	struct list_head next_events_list;
+	struct mutex n_events_list_lock;
+	DECLARE_KFIFO_PTR(events, struct counter_event);
+	wait_queue_head_t events_wait;
+	struct mutex events_lock;
+	/*
+	 * chrdev_lock is locked by counter_chrdev_open() and unlocked by
+	 * counter_chrdev_release(), so a mutex is not possible here because
+	 * chrdev_lock will invariably be held when returning to user space
+	 */
+	atomic_t chrdev_lock;
+	struct mutex ops_exist_lock;
 };
 
 int counter_register(struct counter_device *const counter);
 void counter_unregister(struct counter_device *const counter);
 int devm_counter_register(struct device *dev,
 			  struct counter_device *const counter);
+void counter_push_event(struct counter_device *const counter, const u8 event,
+			const u8 channel);
 
 #define COUNTER_COMP_DEVICE_U8(_name, _read, _write) \
 { \
-- 
cgit v1.2.3


From 914ff7719e8adaf87131dd7cd0a3520afcf89516 Mon Sep 17 00:00:00 2001
From: Kyung Min Park <kyung.min.park@intel.com>
Date: Thu, 14 Oct 2021 13:38:32 +0800
Subject: iommu/vt-d: Dump DMAR translation structure when DMA fault occurs

When the dmar translation fault happens, the kernel prints a single line
fault reason with corresponding hexadecimal code defined in the Intel VT-d
specification.

Currently, when user wants to debug the translation fault in detail,
debugfs is used for dumping the dmar_translation_struct, which is not
available when the kernel failed to boot.

Dump the DMAR translation structure, pagewalk the IO page table and print
the page table entry when the fault happens.

This takes effect only when CONFIG_DMAR_DEBUG is enabled.

Signed-off-by: Kyung Min Park <kyung.min.park@intel.com>
Link: https://lore.kernel.org/r/20210815203845.31287-1-kyung.min.park@intel.com
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Link: https://lore.kernel.org/r/20211014053839.727419-3-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/dmar.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index e04436a7ff27..45e903d84733 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -131,6 +131,14 @@ static inline int dmar_res_noop(struct acpi_dmar_header *hdr, void *arg)
 	return 0;
 }
 
+#ifdef CONFIG_DMAR_DEBUG
+void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
+			  unsigned long long addr, u32 pasid);
+#else
+static inline void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
+					unsigned long long addr, u32 pasid) {}
+#endif
+
 #ifdef CONFIG_INTEL_IOMMU
 extern int iommu_detected, no_iommu;
 extern int intel_iommu_init(void);
-- 
cgit v1.2.3


From b34380a6d767c54480a937951e6189a7f9699443 Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Thu, 14 Oct 2021 13:38:33 +0800
Subject: iommu/vt-d: Remove duplicate identity domain flag

The iommu_domain data structure already has the "type" field to keep the
type of a domain. It's unnecessary to have the DOMAIN_FLAG_STATIC_IDENTITY
flag in the vt-d implementation. This cleans it up with no functionality
change.

Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Link: https://lore.kernel.org/r/20210926114535.923263-1-baolu.lu@linux.intel.com
Link: https://lore.kernel.org/r/20211014053839.727419-4-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/intel-iommu.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 05a65eb155f7..65b15af3cf1a 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -517,9 +517,6 @@ struct context_entry {
 	u64 hi;
 };
 
-/* si_domain contains mulitple devices */
-#define DOMAIN_FLAG_STATIC_IDENTITY		BIT(0)
-
 /*
  * When VT-d works in the scalable mode, it allows DMA translation to
  * happen through either first level or second level page table. This
-- 
cgit v1.2.3


From 37c8041a818dec2cea22ea48a3f90d512a9e1fcd Mon Sep 17 00:00:00 2001
From: "Longpeng(Mike)" <longpeng2@huawei.com>
Date: Thu, 14 Oct 2021 13:38:38 +0800
Subject: iommu/vt-d: Convert the return type of first_pte_in_page to bool

The first_pte_in_page() returns true or false, so let's convert its
return type to bool. In addition, use 'IS_ALIGNED' to make the
code more readable and neater.

Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
Link: https://lore.kernel.org/r/20211008000433.1115-1-longpeng2@huawei.com
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Link: https://lore.kernel.org/r/20211014053839.727419-9-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/intel-iommu.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 65b15af3cf1a..52481625838c 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -705,9 +705,9 @@ static inline bool dma_pte_superpage(struct dma_pte *pte)
 	return (pte->val & DMA_PTE_LARGE_PAGE);
 }
 
-static inline int first_pte_in_page(struct dma_pte *pte)
+static inline bool first_pte_in_page(struct dma_pte *pte)
 {
-	return !((unsigned long)pte & ~VTD_PAGE_MASK);
+	return IS_ALIGNED((unsigned long)pte, VTD_PAGE_SIZE);
 }
 
 extern struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev);
-- 
cgit v1.2.3


From 9906b9352a35427508d974db3a496eb3c49e2010 Mon Sep 17 00:00:00 2001
From: "Longpeng(Mike)" <longpeng2@huawei.com>
Date: Thu, 14 Oct 2021 13:38:39 +0800
Subject: iommu/vt-d: Avoid duplicate removing in __domain_mapping()

The __domain_mapping() always removes the pages in the range from
'iov_pfn' to 'end_pfn', but the 'end_pfn' is always the last pfn
of the range that the caller wants to map.

This would introduce too many duplicated removing and leads the
map operation take too long, for example:

  Map iova=0x100000,nr_pages=0x7d61800
    iov_pfn: 0x100000, end_pfn: 0x7e617ff
    iov_pfn: 0x140000, end_pfn: 0x7e617ff
    iov_pfn: 0x180000, end_pfn: 0x7e617ff
    iov_pfn: 0x1c0000, end_pfn: 0x7e617ff
    iov_pfn: 0x200000, end_pfn: 0x7e617ff
    ...
  it takes about 50ms in total.

We can reduce the cost by recalculate the 'end_pfn' and limit it
to the boundary of the end of this pte page.

  Map iova=0x100000,nr_pages=0x7d61800
    iov_pfn: 0x100000, end_pfn: 0x13ffff
    iov_pfn: 0x140000, end_pfn: 0x17ffff
    iov_pfn: 0x180000, end_pfn: 0x1bffff
    iov_pfn: 0x1c0000, end_pfn: 0x1fffff
    iov_pfn: 0x200000, end_pfn: 0x23ffff
    ...
  it only need 9ms now.

This also removes a meaningless BUG_ON() in __domain_mapping().

Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
Tested-by: Liujunjie <liujunjie23@huawei.com>
Link: https://lore.kernel.org/r/20211008000433.1115-1-longpeng2@huawei.com
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Link: https://lore.kernel.org/r/20211014053839.727419-10-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/intel-iommu.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 52481625838c..69230fd695ea 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -710,6 +710,12 @@ static inline bool first_pte_in_page(struct dma_pte *pte)
 	return IS_ALIGNED((unsigned long)pte, VTD_PAGE_SIZE);
 }
 
+static inline int nr_pte_to_next_page(struct dma_pte *pte)
+{
+	return first_pte_in_page(pte) ? BIT_ULL(VTD_STRIDE_SHIFT) :
+		(struct dma_pte *)ALIGN((unsigned long)pte, VTD_PAGE_SIZE) - pte;
+}
+
 extern struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev);
 extern int dmar_find_matched_atsr_unit(struct pci_dev *dev);
 
-- 
cgit v1.2.3


From 53c36de0701f2fdda6b1d209dc89e2bad4a9c7b8 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 30 Dec 2020 10:21:39 -0500
Subject: mm: Add kmap_local_folio()

This allows us to map a portion of a folio.  Callers can only expect
to access up to the next page boundary.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/highmem-internal.h | 11 +++++++++++
 include/linux/highmem.h          | 37 +++++++++++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h
index 4aa1031d3e4c..0a0b2b09b1b8 100644
--- a/include/linux/highmem-internal.h
+++ b/include/linux/highmem-internal.h
@@ -73,6 +73,12 @@ static inline void *kmap_local_page(struct page *page)
 	return __kmap_local_page_prot(page, kmap_prot);
 }
 
+static inline void *kmap_local_folio(struct folio *folio, size_t offset)
+{
+	struct page *page = folio_page(folio, offset / PAGE_SIZE);
+	return __kmap_local_page_prot(page, kmap_prot) + offset % PAGE_SIZE;
+}
+
 static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot)
 {
 	return __kmap_local_page_prot(page, prot);
@@ -171,6 +177,11 @@ static inline void *kmap_local_page(struct page *page)
 	return page_address(page);
 }
 
+static inline void *kmap_local_folio(struct folio *folio, size_t offset)
+{
+	return page_address(&folio->page) + offset;
+}
+
 static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot)
 {
 	return kmap_local_page(page);
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index b4c49f9cc379..27cdd715c5f9 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -96,6 +96,43 @@ static inline void kmap_flush_unused(void);
  */
 static inline void *kmap_local_page(struct page *page);
 
+/**
+ * kmap_local_folio - Map a page in this folio for temporary usage
+ * @folio: The folio containing the page.
+ * @offset: The byte offset within the folio which identifies the page.
+ *
+ * Requires careful handling when nesting multiple mappings because the map
+ * management is stack based. The unmap has to be in the reverse order of
+ * the map operation::
+ *
+ *   addr1 = kmap_local_folio(folio1, offset1);
+ *   addr2 = kmap_local_folio(folio2, offset2);
+ *   ...
+ *   kunmap_local(addr2);
+ *   kunmap_local(addr1);
+ *
+ * Unmapping addr1 before addr2 is invalid and causes malfunction.
+ *
+ * Contrary to kmap() mappings the mapping is only valid in the context of
+ * the caller and cannot be handed to other contexts.
+ *
+ * On CONFIG_HIGHMEM=n kernels and for low memory pages this returns the
+ * virtual address of the direct mapping. Only real highmem pages are
+ * temporarily mapped.
+ *
+ * While it is significantly faster than kmap() for the higmem case it
+ * comes with restrictions about the pointer validity. Only use when really
+ * necessary.
+ *
+ * On HIGHMEM enabled systems mapping a highmem page has the side effect of
+ * disabling migration in order to keep the virtual address stable across
+ * preemption. No caller of kmap_local_folio() can rely on this side effect.
+ *
+ * Context: Can be invoked from any context.
+ * Return: The virtual address of @offset.
+ */
+static inline void *kmap_local_folio(struct folio *folio, size_t offset);
+
 /**
  * kmap_atomic - Atomically map a page for temporary usage - Deprecated!
  * @page:	Pointer to the page to be mapped
-- 
cgit v1.2.3


From b424de33c42dbd03e39ca8f521126d39acb6c335 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Sat, 24 Apr 2021 14:50:36 -0400
Subject: mm: Add arch_make_folio_accessible()

As a default implementation, call arch_make_page_accessible n times.
If an architecture can do better, it can override this.

Also move the default implementation of arch_make_page_accessible()
from gfp.h to mm.h.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/gfp.h |  6 ------
 include/linux/mm.h  | 23 +++++++++++++++++++++++
 2 files changed, 23 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 55b2ec1f965a..dc5ff40608ce 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -520,12 +520,6 @@ static inline void arch_free_page(struct page *page, int order) { }
 #ifndef HAVE_ARCH_ALLOC_PAGE
 static inline void arch_alloc_page(struct page *page, int order) { }
 #endif
-#ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
-static inline int arch_make_page_accessible(struct page *page)
-{
-	return 0;
-}
-#endif
 
 struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
 		nodemask_t *nodemask);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 47143f3e7f0a..c28cace6fe6b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1734,6 +1734,29 @@ static inline size_t folio_size(struct folio *folio)
 	return PAGE_SIZE << folio_order(folio);
 }
 
+#ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
+static inline int arch_make_page_accessible(struct page *page)
+{
+	return 0;
+}
+#endif
+
+#ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE
+static inline int arch_make_folio_accessible(struct folio *folio)
+{
+	int ret;
+	long i, nr = folio_nr_pages(folio);
+
+	for (i = 0; i < nr; i++) {
+		ret = arch_make_page_accessible(folio_page(folio, i));
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+#endif
+
 /*
  * Some inline functions in vmstat.h depend on page_zone()
  */
-- 
cgit v1.2.3


From 35a020ba0802f732ba070e03e50b140aea4373c4 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 27 Apr 2021 09:00:12 -0400
Subject: mm: Add folio_young and folio_idle

Idle page tracking is handled through page_ext on 32-bit architectures.
Add folio equivalents for 32-bit and move all the page compatibility
parts to common code.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Howells <dhowells@redhat.com>
---
 include/linux/page_idle.h | 99 +++++++++++++++++++++++------------------------
 1 file changed, 49 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h
index d8a6aecf99cb..83abf95e9fa7 100644
--- a/include/linux/page_idle.h
+++ b/include/linux/page_idle.h
@@ -8,46 +8,16 @@
 
 #ifdef CONFIG_PAGE_IDLE_FLAG
 
-#ifdef CONFIG_64BIT
-static inline bool page_is_young(struct page *page)
-{
-	return PageYoung(page);
-}
-
-static inline void set_page_young(struct page *page)
-{
-	SetPageYoung(page);
-}
-
-static inline bool test_and_clear_page_young(struct page *page)
-{
-	return TestClearPageYoung(page);
-}
-
-static inline bool page_is_idle(struct page *page)
-{
-	return PageIdle(page);
-}
-
-static inline void set_page_idle(struct page *page)
-{
-	SetPageIdle(page);
-}
-
-static inline void clear_page_idle(struct page *page)
-{
-	ClearPageIdle(page);
-}
-#else /* !CONFIG_64BIT */
+#ifndef CONFIG_64BIT
 /*
  * If there is not enough space to store Idle and Young bits in page flags, use
  * page ext flags instead.
  */
 extern struct page_ext_operations page_idle_ops;
 
-static inline bool page_is_young(struct page *page)
+static inline bool folio_test_young(struct folio *folio)
 {
-	struct page_ext *page_ext = lookup_page_ext(page);
+	struct page_ext *page_ext = lookup_page_ext(&folio->page);
 
 	if (unlikely(!page_ext))
 		return false;
@@ -55,9 +25,9 @@ static inline bool page_is_young(struct page *page)
 	return test_bit(PAGE_EXT_YOUNG, &page_ext->flags);
 }
 
-static inline void set_page_young(struct page *page)
+static inline void folio_set_young(struct folio *folio)
 {
-	struct page_ext *page_ext = lookup_page_ext(page);
+	struct page_ext *page_ext = lookup_page_ext(&folio->page);
 
 	if (unlikely(!page_ext))
 		return;
@@ -65,9 +35,9 @@ static inline void set_page_young(struct page *page)
 	set_bit(PAGE_EXT_YOUNG, &page_ext->flags);
 }
 
-static inline bool test_and_clear_page_young(struct page *page)
+static inline bool folio_test_clear_young(struct folio *folio)
 {
-	struct page_ext *page_ext = lookup_page_ext(page);
+	struct page_ext *page_ext = lookup_page_ext(&folio->page);
 
 	if (unlikely(!page_ext))
 		return false;
@@ -75,9 +45,9 @@ static inline bool test_and_clear_page_young(struct page *page)
 	return test_and_clear_bit(PAGE_EXT_YOUNG, &page_ext->flags);
 }
 
-static inline bool page_is_idle(struct page *page)
+static inline bool folio_test_idle(struct folio *folio)
 {
-	struct page_ext *page_ext = lookup_page_ext(page);
+	struct page_ext *page_ext = lookup_page_ext(&folio->page);
 
 	if (unlikely(!page_ext))
 		return false;
@@ -85,9 +55,9 @@ static inline bool page_is_idle(struct page *page)
 	return test_bit(PAGE_EXT_IDLE, &page_ext->flags);
 }
 
-static inline void set_page_idle(struct page *page)
+static inline void folio_set_idle(struct folio *folio)
 {
-	struct page_ext *page_ext = lookup_page_ext(page);
+	struct page_ext *page_ext = lookup_page_ext(&folio->page);
 
 	if (unlikely(!page_ext))
 		return;
@@ -95,46 +65,75 @@ static inline void set_page_idle(struct page *page)
 	set_bit(PAGE_EXT_IDLE, &page_ext->flags);
 }
 
-static inline void clear_page_idle(struct page *page)
+static inline void folio_clear_idle(struct folio *folio)
 {
-	struct page_ext *page_ext = lookup_page_ext(page);
+	struct page_ext *page_ext = lookup_page_ext(&folio->page);
 
 	if (unlikely(!page_ext))
 		return;
 
 	clear_bit(PAGE_EXT_IDLE, &page_ext->flags);
 }
-#endif /* CONFIG_64BIT */
+#endif /* !CONFIG_64BIT */
 
 #else /* !CONFIG_PAGE_IDLE_FLAG */
 
-static inline bool page_is_young(struct page *page)
+static inline bool folio_test_young(struct folio *folio)
 {
 	return false;
 }
 
-static inline void set_page_young(struct page *page)
+static inline void folio_set_young(struct folio *folio)
 {
 }
 
-static inline bool test_and_clear_page_young(struct page *page)
+static inline bool folio_test_clear_young(struct folio *folio)
 {
 	return false;
 }
 
-static inline bool page_is_idle(struct page *page)
+static inline bool folio_test_idle(struct folio *folio)
 {
 	return false;
 }
 
-static inline void set_page_idle(struct page *page)
+static inline void folio_set_idle(struct folio *folio)
 {
 }
 
-static inline void clear_page_idle(struct page *page)
+static inline void folio_clear_idle(struct folio *folio)
 {
 }
 
 #endif /* CONFIG_PAGE_IDLE_FLAG */
 
+static inline bool page_is_young(struct page *page)
+{
+	return folio_test_young(page_folio(page));
+}
+
+static inline void set_page_young(struct page *page)
+{
+	folio_set_young(page_folio(page));
+}
+
+static inline bool test_and_clear_page_young(struct page *page)
+{
+	return folio_test_clear_young(page_folio(page));
+}
+
+static inline bool page_is_idle(struct page *page)
+{
+	return folio_test_idle(page_folio(page));
+}
+
+static inline void set_page_idle(struct page *page)
+{
+	folio_set_idle(page_folio(page));
+}
+
+static inline void clear_page_idle(struct page *page)
+{
+	folio_clear_idle(page_folio(page));
+}
 #endif /* _LINUX_MM_PAGE_IDLE_H */
-- 
cgit v1.2.3


From 76580b6529db622964b4ffee59ded25efcb73748 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 27 Apr 2021 10:47:39 -0400
Subject: mm/swap: Add folio_mark_accessed()

Convert mark_page_accessed() to folio_mark_accessed().  It already
operated on the entire compound page, but now we can avoid calling
compound_head quite so many times.  Shrinks the function from 424 bytes
to 295 bytes (shrinking by 129 bytes).  The compatibility wrapper is 30
bytes, plus the 8 bytes for the exported symbol means the kernel shrinks
by 91 bytes.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/swap.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0fc84797623f..067b7af5242c 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -352,7 +352,8 @@ extern void lru_note_cost(struct lruvec *lruvec, bool file,
 			  unsigned int nr_pages);
 extern void lru_note_cost_page(struct page *);
 extern void lru_cache_add(struct page *);
-extern void mark_page_accessed(struct page *);
+void mark_page_accessed(struct page *);
+void folio_mark_accessed(struct folio *);
 
 extern atomic_t lru_disable_count;
 
-- 
cgit v1.2.3


From d9c08e2232fbca4fa56b9350f7f84835c4aa3add Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Sun, 28 Feb 2021 16:02:57 -0500
Subject: mm/rmap: Add folio_mkclean()

Transform page_mkclean() into folio_mkclean() and add a page_mkclean()
wrapper around folio_mkclean().

folio_mkclean is 15 bytes smaller than page_mkclean, but the kernel
is enlarged by 33 bytes due to inlining page_folio() into each caller.
This will go away once the callers are converted to use folio_mkclean().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/rmap.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index c976cc6de257..e704b1a4c06c 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -235,7 +235,7 @@ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);
  *
  * returns the number of cleaned PTEs.
  */
-int page_mkclean(struct page *);
+int folio_mkclean(struct folio *);
 
 /*
  * called in munlock()/munmap() path to check for other vmas holding
@@ -295,12 +295,14 @@ static inline void try_to_unmap(struct page *page, enum ttu_flags flags)
 {
 }
 
-static inline int page_mkclean(struct page *page)
+static inline int folio_mkclean(struct folio *folio)
 {
 	return 0;
 }
-
-
 #endif	/* CONFIG_MMU */
 
+static inline int page_mkclean(struct page *page)
+{
+	return folio_mkclean(page_folio(page));
+}
 #endif	/* _LINUX_RMAP_H */
-- 
cgit v1.2.3


From 3417013e0d183be9b42d794082eec0ec1c5b5f15 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 7 May 2021 07:28:40 -0400
Subject: mm/migrate: Add folio_migrate_mapping()

Reimplement migrate_page_move_mapping() as a wrapper around
folio_migrate_mapping().  Saves 193 bytes of kernel text.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/migrate.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index c8077e936691..58b3af645e20 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -57,6 +57,8 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
 				  struct page *newpage, struct page *page);
 extern int migrate_page_move_mapping(struct address_space *mapping,
 		struct page *newpage, struct page *page, int extra_count);
+int folio_migrate_mapping(struct address_space *mapping,
+		struct folio *newfolio, struct folio *folio, int extra_count);
 #else
 
 static inline void putback_movable_pages(struct list_head *l) {}
-- 
cgit v1.2.3


From 19138349ed59b90ce58aca319b873eca2e04ad43 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 7 May 2021 15:26:29 -0400
Subject: mm/migrate: Add folio_migrate_flags()

Turn migrate_page_states() into a wrapper around folio_migrate_flags().
Also convert two functions only called from folio_migrate_flags() to
be folio-based.  ksm_migrate_page() becomes folio_migrate_ksm() and
copy_page_owner() becomes folio_copy_owner().  folio_migrate_flags()
alone shrinks by two thirds -- 1967 bytes down to 642 bytes.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/ksm.h        | 4 ++--
 include/linux/migrate.h    | 1 +
 include/linux/page_owner.h | 8 ++++----
 3 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 161e8164abcf..a38a5bca1ba5 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -52,7 +52,7 @@ struct page *ksm_might_need_to_copy(struct page *page,
 			struct vm_area_struct *vma, unsigned long address);
 
 void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc);
-void ksm_migrate_page(struct page *newpage, struct page *oldpage);
+void folio_migrate_ksm(struct folio *newfolio, struct folio *folio);
 
 #else  /* !CONFIG_KSM */
 
@@ -83,7 +83,7 @@ static inline void rmap_walk_ksm(struct page *page,
 {
 }
 
-static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage)
+static inline void folio_migrate_ksm(struct folio *newfolio, struct folio *old)
 {
 }
 #endif /* CONFIG_MMU */
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 58b3af645e20..aa875b83f7ba 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -57,6 +57,7 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
 				  struct page *newpage, struct page *page);
 extern int migrate_page_move_mapping(struct address_space *mapping,
 		struct page *newpage, struct page *page, int extra_count);
+void folio_migrate_flags(struct folio *newfolio, struct folio *folio);
 int folio_migrate_mapping(struct address_space *mapping,
 		struct folio *newfolio, struct folio *folio, int extra_count);
 #else
diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h
index 719bfe5108c5..43c638c51c1f 100644
--- a/include/linux/page_owner.h
+++ b/include/linux/page_owner.h
@@ -12,7 +12,7 @@ extern void __reset_page_owner(struct page *page, unsigned int order);
 extern void __set_page_owner(struct page *page,
 			unsigned int order, gfp_t gfp_mask);
 extern void __split_page_owner(struct page *page, unsigned int nr);
-extern void __copy_page_owner(struct page *oldpage, struct page *newpage);
+extern void __folio_copy_owner(struct folio *newfolio, struct folio *old);
 extern void __set_page_owner_migrate_reason(struct page *page, int reason);
 extern void __dump_page_owner(const struct page *page);
 extern void pagetypeinfo_showmixedcount_print(struct seq_file *m,
@@ -36,10 +36,10 @@ static inline void split_page_owner(struct page *page, unsigned int nr)
 	if (static_branch_unlikely(&page_owner_inited))
 		__split_page_owner(page, nr);
 }
-static inline void copy_page_owner(struct page *oldpage, struct page *newpage)
+static inline void folio_copy_owner(struct folio *newfolio, struct folio *old)
 {
 	if (static_branch_unlikely(&page_owner_inited))
-		__copy_page_owner(oldpage, newpage);
+		__folio_copy_owner(newfolio, old);
 }
 static inline void set_page_owner_migrate_reason(struct page *page, int reason)
 {
@@ -63,7 +63,7 @@ static inline void split_page_owner(struct page *page,
 			unsigned int order)
 {
 }
-static inline void copy_page_owner(struct page *oldpage, struct page *newpage)
+static inline void folio_copy_owner(struct folio *newfolio, struct folio *folio)
 {
 }
 static inline void set_page_owner_migrate_reason(struct page *page, int reason)
-- 
cgit v1.2.3


From 715cbfd6c5c595bc8b7a6f9ad1fe9fec0122bb20 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 7 May 2021 15:05:06 -0400
Subject: mm/migrate: Add folio_migrate_copy()

This is the folio equivalent of migrate_page_copy(), which is retained
as a wrapper for filesystems which are not yet converted to folios.
Also convert copy_huge_page() to folio_copy().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/migrate.h | 1 +
 include/linux/mm.h      | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index aa875b83f7ba..0d2aeb9b0f66 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -58,6 +58,7 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
 extern int migrate_page_move_mapping(struct address_space *mapping,
 		struct page *newpage, struct page *page, int extra_count);
 void folio_migrate_flags(struct folio *newfolio, struct folio *folio);
+void folio_migrate_copy(struct folio *newfolio, struct folio *folio);
 int folio_migrate_mapping(struct address_space *mapping,
 		struct folio *newfolio, struct folio *folio, int extra_count);
 #else
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c28cace6fe6b..93d5fbe2e4e3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -912,7 +912,7 @@ void __put_page(struct page *page);
 void put_pages_list(struct list_head *pages);
 
 void split_page(struct page *page, unsigned int order);
-void copy_huge_page(struct page *dst, struct page *src);
+void folio_copy(struct folio *dst, struct folio *src);
 
 /*
  * Compound pages have a destructor function.  Provide a
-- 
cgit v1.2.3


From bd3488e7b4d61780eb3dfaca1cc6f4026bcffd48 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 19 Mar 2021 08:39:50 -0400
Subject: mm/writeback: Rename __add_wb_stat() to wb_stat_mod()

Make this look like the newly renamed vmstat functions.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/backing-dev.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index ac7f231b8825..a62e72dd829f 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -64,7 +64,7 @@ static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi)
 	return atomic_long_read(&bdi->tot_write_bandwidth);
 }
 
-static inline void __add_wb_stat(struct bdi_writeback *wb,
+static inline void wb_stat_mod(struct bdi_writeback *wb,
 				 enum wb_stat_item item, s64 amount)
 {
 	percpu_counter_add_batch(&wb->stat[item], amount, WB_STAT_BATCH);
@@ -72,12 +72,12 @@ static inline void __add_wb_stat(struct bdi_writeback *wb,
 
 static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
 {
-	__add_wb_stat(wb, item, 1);
+	wb_stat_mod(wb, item, 1);
 }
 
 static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
 {
-	__add_wb_stat(wb, item, -1);
+	wb_stat_mod(wb, item, -1);
 }
 
 static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
-- 
cgit v1.2.3


From be5f1797523004d0d9aaee81a523d86e3b890007 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Sat, 20 Mar 2021 16:34:54 -0400
Subject: flex_proportions: Allow N events instead of 1

When batching events (such as writing back N pages in a single I/O), it
is better to do one flex_proportion operation instead of N.  There is
only one caller of __fprop_inc_percpu_max(), and it's the one we're
going to change in the next patch, so rename it instead of adding a
compatibility wrapper.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 include/linux/flex_proportions.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/flex_proportions.h b/include/linux/flex_proportions.h
index c12df59d3f5f..3e378b1fb0bc 100644
--- a/include/linux/flex_proportions.h
+++ b/include/linux/flex_proportions.h
@@ -83,9 +83,10 @@ struct fprop_local_percpu {
 
 int fprop_local_init_percpu(struct fprop_local_percpu *pl, gfp_t gfp);
 void fprop_local_destroy_percpu(struct fprop_local_percpu *pl);
-void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl);
-void __fprop_inc_percpu_max(struct fprop_global *p, struct fprop_local_percpu *pl,
-			    int max_frac);
+void __fprop_add_percpu(struct fprop_global *p, struct fprop_local_percpu *pl,
+		long nr);
+void __fprop_add_percpu_max(struct fprop_global *p,
+		struct fprop_local_percpu *pl, int max_frac, long nr);
 void fprop_fraction_percpu(struct fprop_global *p,
 	struct fprop_local_percpu *pl, unsigned long *numerator,
 	unsigned long *denominator);
@@ -96,7 +97,7 @@ void fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl)
 	unsigned long flags;
 
 	local_irq_save(flags);
-	__fprop_inc_percpu(p, pl);
+	__fprop_add_percpu(p, pl, 1);
 	local_irq_restore(flags);
 }
 
-- 
cgit v1.2.3


From 269ccca3899f6bce49e004f50f623e0b161fb027 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 15 Jan 2021 23:34:16 -0500
Subject: mm/writeback: Add __folio_end_writeback()

test_clear_page_writeback() is actually an mm-internal function, although
it's named as if it's a pagecache function.  Move it to mm/internal.h,
rename it to __folio_end_writeback() and change the return type to bool.

The conversion from page to folio is mostly about accounting the number
of pages being written back, although it does eliminate a couple of
calls to compound_head().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/page-flags.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 2491e84b8e52..3aebb2060a26 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -657,7 +657,6 @@ static __always_inline void SetPageUptodate(struct page *page)
 
 CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL)
 
-int test_clear_page_writeback(struct page *page);
 int __test_set_page_writeback(struct page *page, bool keep_write);
 
 #define test_set_page_writeback(page)			\
-- 
cgit v1.2.3


From f143f1ea5a5380b2682e6836fcd24338a2aa82c7 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Sat, 24 Apr 2021 12:00:48 -0400
Subject: mm/writeback: Add folio_start_writeback()

Rename set_page_writeback() to folio_start_writeback() to match
folio_end_writeback().  Do not bother with wrappers that return void;
callers are perfectly capable of ignoring return values.

Add wrappers for set_page_writeback(), set_page_writeback_keepwrite() and
test_set_page_writeback() for compatibililty with existing filesystems.
The main advantage of this patch is getting the statistics right,
although it does eliminate a couple of calls to compound_head().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/page-flags.h | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 3aebb2060a26..a68af80649a4 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -657,21 +657,22 @@ static __always_inline void SetPageUptodate(struct page *page)
 
 CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL)
 
-int __test_set_page_writeback(struct page *page, bool keep_write);
+bool __folio_start_writeback(struct folio *folio, bool keep_write);
+bool set_page_writeback(struct page *page);
 
-#define test_set_page_writeback(page)			\
-	__test_set_page_writeback(page, false)
-#define test_set_page_writeback_keepwrite(page)	\
-	__test_set_page_writeback(page, true)
+#define folio_start_writeback(folio)			\
+	__folio_start_writeback(folio, false)
+#define folio_start_writeback_keepwrite(folio)	\
+	__folio_start_writeback(folio, true)
 
-static inline void set_page_writeback(struct page *page)
+static inline void set_page_writeback_keepwrite(struct page *page)
 {
-	test_set_page_writeback(page);
+	folio_start_writeback_keepwrite(page_folio(page));
 }
 
-static inline void set_page_writeback_keepwrite(struct page *page)
+static inline bool test_set_page_writeback(struct page *page)
 {
-	test_set_page_writeback_keepwrite(page);
+	return set_page_writeback(page);
 }
 
 __PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY)
-- 
cgit v1.2.3


From b5e84594cafb934e023ee7d4ea4208b6c6b65fcc Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 26 Apr 2021 23:53:10 -0400
Subject: mm/writeback: Add folio_mark_dirty()

Reimplement set_page_dirty() as a wrapper around folio_mark_dirty().
There is no change to filesystems as they were already being called
with the compound_head of the page being marked dirty.  We avoid
several calls to compound_head(), both statically (through
using folio_test_dirty() instead of PageDirty() and dynamically by
calling folio_mapping() instead of page_mapping().

Also return bool instead of int to show the range of values actually
returned, and add kernel-doc.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/mm.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 93d5fbe2e4e3..00510b02ffe1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2008,7 +2008,8 @@ int redirty_page_for_writepage(struct writeback_control *wbc,
 				struct page *page);
 void account_page_cleaned(struct page *page, struct address_space *mapping,
 			  struct bdi_writeback *wb);
-int set_page_dirty(struct page *page);
+bool folio_mark_dirty(struct folio *folio);
+bool set_page_dirty(struct page *page);
 int set_page_dirty_lock(struct page *page);
 void __cancel_dirty_page(struct page *page);
 static inline void cancel_dirty_page(struct page *page)
-- 
cgit v1.2.3


From 203a31516616111b8eaaf00c16fa3fcaa25e6f81 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 4 May 2021 11:01:10 -0400
Subject: mm/writeback: Add __folio_mark_dirty()

Turn __set_page_dirty() into a wrapper around __folio_mark_dirty().
Convert account_page_dirtied() into folio_account_dirtied() and account
the number of pages in the folio to support multi-page folios.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/memcontrol.h | 5 ++---
 include/linux/pagemap.h    | 7 ++++++-
 2 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 7bd78c13d1fa..e34bf0cbdf55 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1615,10 +1615,9 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
 void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
 					     struct bdi_writeback *wb);
 
-static inline void mem_cgroup_track_foreign_dirty(struct page *page,
+static inline void mem_cgroup_track_foreign_dirty(struct folio *folio,
 						  struct bdi_writeback *wb)
 {
-	struct folio *folio = page_folio(page);
 	if (mem_cgroup_disabled())
 		return;
 
@@ -1643,7 +1642,7 @@ static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
 {
 }
 
-static inline void mem_cgroup_track_foreign_dirty(struct page *page,
+static inline void mem_cgroup_track_foreign_dirty(struct folio *folio,
 						  struct bdi_writeback *wb)
 {
 }
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index bdbd7be67812..aa9a083083df 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -772,8 +772,13 @@ void end_page_writeback(struct page *page);
 void folio_end_writeback(struct folio *folio);
 void wait_for_stable_page(struct page *page);
 void folio_wait_stable(struct folio *folio);
+void __folio_mark_dirty(struct folio *folio, struct address_space *, int warn);
+static inline void __set_page_dirty(struct page *page,
+		struct address_space *mapping, int warn)
+{
+	__folio_mark_dirty(page_folio(page), mapping, warn);
+}
 
-void __set_page_dirty(struct page *, struct address_space *, int warn);
 int __set_page_dirty_nobuffers(struct page *page);
 int __set_page_dirty_no_writeback(struct page *page);
 
-- 
cgit v1.2.3


From 85d4d2ebc86f02740c5f5f72ec43cc47d3560720 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 3 May 2021 23:30:44 -0400
Subject: mm/writeback: Add filemap_dirty_folio()

Reimplement __set_page_dirty_nobuffers() as a wrapper around
filemap_dirty_folio().  Eventually folio_mark_dirty() will pass
the folio's mapping to the address space's ->dirty_folio()
operation, so add the parameter to filemap_dirty_folio() now.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/writeback.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index d1f65adf6a26..d4f24eba066f 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -393,6 +393,7 @@ void writeback_set_ratelimit(void);
 void tag_pages_for_writeback(struct address_space *mapping,
 			     pgoff_t start, pgoff_t end);
 
+bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio);
 void account_page_redirty(struct page *page);
 
 void sb_mark_inode_writeback(struct inode *inode);
-- 
cgit v1.2.3


From fc9b6a538b222eaeb4d1e3f93e716b6626d9b653 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 4 May 2021 16:12:09 -0400
Subject: mm/writeback: Add folio_account_cleaned()

Get the statistics right; compound pages were being accounted as a
single page.  This didn't matter before now as no filesystem which
supported compound pages did writeback.  Also move the declaration
to pagemap.h since this is part of the page cache.  Add a wrapper for
account_page_cleaned().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/mm.h      | 3 ---
 include/linux/pagemap.h | 7 +++++++
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 00510b02ffe1..f47af4ca4873 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -39,7 +39,6 @@ struct anon_vma_chain;
 struct file_ra_state;
 struct user_struct;
 struct writeback_control;
-struct bdi_writeback;
 struct pt_regs;
 
 extern int sysctl_page_lock_unfairness;
@@ -2006,8 +2005,6 @@ extern void do_invalidatepage(struct page *page, unsigned int offset,
 
 int redirty_page_for_writepage(struct writeback_control *wbc,
 				struct page *page);
-void account_page_cleaned(struct page *page, struct address_space *mapping,
-			  struct bdi_writeback *wb);
 bool folio_mark_dirty(struct folio *folio);
 bool set_page_dirty(struct page *page);
 int set_page_dirty_lock(struct page *page);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index aa9a083083df..8ba7da513ac4 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -778,6 +778,13 @@ static inline void __set_page_dirty(struct page *page,
 {
 	__folio_mark_dirty(page_folio(page), mapping, warn);
 }
+void folio_account_cleaned(struct folio *folio, struct address_space *mapping,
+			  struct bdi_writeback *wb);
+static inline void account_page_cleaned(struct page *page,
+		struct address_space *mapping, struct bdi_writeback *wb)
+{
+	return folio_account_cleaned(page_folio(page), mapping, wb);
+}
 
 int __set_page_dirty_nobuffers(struct page *page);
 int __set_page_dirty_no_writeback(struct page *page);
-- 
cgit v1.2.3


From fdaf532a23795e320c47e1caab50a53c202135c5 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 8 Mar 2021 16:43:04 -0500
Subject: mm/writeback: Add folio_cancel_dirty()

Turn __cancel_dirty_page() into __folio_cancel_dirty() and add wrappers.
Move the prototypes into pagemap.h since this is page cache functionality.
Saves 44 bytes of kernel text in total; 33 bytes from __folio_cancel_dirty
and 11 from two callers of cancel_dirty_page().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/mm.h      |  7 -------
 include/linux/pagemap.h | 11 +++++++++++
 2 files changed, 11 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f47af4ca4873..3bc394aafbc0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2008,13 +2008,6 @@ int redirty_page_for_writepage(struct writeback_control *wbc,
 bool folio_mark_dirty(struct folio *folio);
 bool set_page_dirty(struct page *page);
 int set_page_dirty_lock(struct page *page);
-void __cancel_dirty_page(struct page *page);
-static inline void cancel_dirty_page(struct page *page)
-{
-	/* Avoid atomic ops, locking, etc. when not actually needed. */
-	if (PageDirty(page))
-		__cancel_dirty_page(page);
-}
 int clear_page_dirty_for_io(struct page *page);
 
 int get_cmdline(struct task_struct *task, char *buffer, int buflen);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 8ba7da513ac4..8f4a9ba0bfb0 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -785,6 +785,17 @@ static inline void account_page_cleaned(struct page *page,
 {
 	return folio_account_cleaned(page_folio(page), mapping, wb);
 }
+void __folio_cancel_dirty(struct folio *folio);
+static inline void folio_cancel_dirty(struct folio *folio)
+{
+	/* Avoid atomic ops, locking, etc. when not actually needed. */
+	if (folio_test_dirty(folio))
+		__folio_cancel_dirty(folio);
+}
+static inline void cancel_dirty_page(struct page *page)
+{
+	folio_cancel_dirty(page_folio(page));
+}
 
 int __set_page_dirty_nobuffers(struct page *page);
 int __set_page_dirty_no_writeback(struct page *page);
-- 
cgit v1.2.3


From 9350f20a070d27a6c6a292a2b6f6091e7ea90a65 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Sun, 28 Feb 2021 16:21:20 -0500
Subject: mm/writeback: Add folio_clear_dirty_for_io()

Transform clear_page_dirty_for_io() into folio_clear_dirty_for_io()
and add a compatibility wrapper.  Also move the declaration to pagemap.h
as this is page cache functionality that doesn't need to be used by the
rest of the kernel.

Increases the size of the kernel by 79 bytes.  While we remove a few
calls to compound_head(), we add a call to folio_nr_pages() to get the
stats correct for the eventual support of multi-page folios.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/mm.h      | 1 -
 include/linux/pagemap.h | 2 ++
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3bc394aafbc0..dd5c6970ba67 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2008,7 +2008,6 @@ int redirty_page_for_writepage(struct writeback_control *wbc,
 bool folio_mark_dirty(struct folio *folio);
 bool set_page_dirty(struct page *page);
 int set_page_dirty_lock(struct page *page);
-int clear_page_dirty_for_io(struct page *page);
 
 int get_cmdline(struct task_struct *task, char *buffer, int buflen);
 
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 8f4a9ba0bfb0..0dda6459d2d1 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -796,6 +796,8 @@ static inline void cancel_dirty_page(struct page *page)
 {
 	folio_cancel_dirty(page_folio(page));
 }
+bool folio_clear_dirty_for_io(struct folio *folio);
+bool clear_page_dirty_for_io(struct page *page);
 
 int __set_page_dirty_nobuffers(struct page *page);
 int __set_page_dirty_no_writeback(struct page *page);
-- 
cgit v1.2.3


From 25ff8b15537dfa0e1a62d55cfcc48f3c8bd8a76c Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 3 May 2021 10:06:55 -0400
Subject: mm/writeback: Add folio_account_redirty()

Account the number of pages in the folio that we're redirtying.
Turn account_page_dirty() into a wrapper around it.  Also turn
the comment on folio_account_redirty() into kernel-doc and
edit it slightly so it makes sense to its potential callers.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/writeback.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index d4f24eba066f..852100bea351 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -394,7 +394,11 @@ void tag_pages_for_writeback(struct address_space *mapping,
 			     pgoff_t start, pgoff_t end);
 
 bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio);
-void account_page_redirty(struct page *page);
+void folio_account_redirty(struct folio *folio);
+static inline void account_page_redirty(struct page *page)
+{
+	folio_account_redirty(page_folio(page));
+}
 
 void sb_mark_inode_writeback(struct inode *inode);
 void sb_clear_inode_writeback(struct inode *inode);
-- 
cgit v1.2.3


From cd78ab11a8810dd297f4751d17cc53e3dce36024 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Sun, 2 May 2021 23:22:52 -0400
Subject: mm/writeback: Add folio_redirty_for_writepage()

Reimplement redirty_page_for_writepage() as a wrapper around
folio_redirty_for_writepage().  Account the number of pages in the
folio, add kernel-doc and move the prototype to writeback.h.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/mm.h        | 4 ----
 include/linux/writeback.h | 2 ++
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index dd5c6970ba67..23ab040aa65a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -36,9 +36,7 @@
 struct mempolicy;
 struct anon_vma;
 struct anon_vma_chain;
-struct file_ra_state;
 struct user_struct;
-struct writeback_control;
 struct pt_regs;
 
 extern int sysctl_page_lock_unfairness;
@@ -2003,8 +2001,6 @@ extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
 extern void do_invalidatepage(struct page *page, unsigned int offset,
 			      unsigned int length);
 
-int redirty_page_for_writepage(struct writeback_control *wbc,
-				struct page *page);
 bool folio_mark_dirty(struct folio *folio);
 bool set_page_dirty(struct page *page);
 int set_page_dirty_lock(struct page *page);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 852100bea351..3571f383dfb6 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -399,6 +399,8 @@ static inline void account_page_redirty(struct page *page)
 {
 	folio_account_redirty(page_folio(page));
 }
+bool folio_redirty_for_writepage(struct writeback_control *, struct folio *);
+bool redirty_page_for_writepage(struct writeback_control *, struct page *);
 
 void sb_mark_inode_writeback(struct inode *inode);
 void sb_clear_inode_writeback(struct inode *inode);
-- 
cgit v1.2.3


From 9eb7c76dd31a53331bec795faaf7daa7ffb80071 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 27 Apr 2021 23:11:28 -0400
Subject: mm/filemap: Add i_blocks_per_folio()

Reimplement i_blocks_per_page() as a wrapper around i_blocks_per_folio().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/pagemap.h | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 0dda6459d2d1..5431d9920dc0 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -1149,19 +1149,25 @@ static inline int page_mkwrite_check_truncate(struct page *page,
 }
 
 /**
- * i_blocks_per_page - How many blocks fit in this page.
+ * i_blocks_per_folio - How many blocks fit in this folio.
  * @inode: The inode which contains the blocks.
- * @page: The page (head page if the page is a THP).
+ * @folio: The folio.
  *
- * If the block size is larger than the size of this page, return zero.
+ * If the block size is larger than the size of this folio, return zero.
  *
- * Context: The caller should hold a refcount on the page to prevent it
+ * Context: The caller should hold a refcount on the folio to prevent it
  * from being split.
- * Return: The number of filesystem blocks covered by this page.
+ * Return: The number of filesystem blocks covered by this folio.
  */
+static inline
+unsigned int i_blocks_per_folio(struct inode *inode, struct folio *folio)
+{
+	return folio_size(folio) >> inode->i_blkbits;
+}
+
 static inline
 unsigned int i_blocks_per_page(struct inode *inode, struct page *page)
 {
-	return thp_size(page) >> inode->i_blkbits;
+	return i_blocks_per_folio(inode, page_folio(page));
 }
 #endif /* _LINUX_PAGEMAP_H */
-- 
cgit v1.2.3


From f705bf84eab2aa3b9842974b8443587727ccb23a Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 28 Apr 2021 22:30:06 -0400
Subject: mm/filemap: Add folio_mkwrite_check_truncate()

This is the folio equivalent of page_mkwrite_check_truncate().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: David Howells <dhowells@redhat.com>
---
 include/linux/pagemap.h | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 5431d9920dc0..4b74d83c5571 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -1120,6 +1120,34 @@ static inline unsigned long dir_pages(struct inode *inode)
 			       PAGE_SHIFT;
 }
 
+/**
+ * folio_mkwrite_check_truncate - check if folio was truncated
+ * @folio: the folio to check
+ * @inode: the inode to check the folio against
+ *
+ * Return: the number of bytes in the folio up to EOF,
+ * or -EFAULT if the folio was truncated.
+ */
+static inline ssize_t folio_mkwrite_check_truncate(struct folio *folio,
+					      struct inode *inode)
+{
+	loff_t size = i_size_read(inode);
+	pgoff_t index = size >> PAGE_SHIFT;
+	size_t offset = offset_in_folio(folio, size);
+
+	if (!folio->mapping)
+		return -EFAULT;
+
+	/* folio is wholly inside EOF */
+	if (folio_next_index(folio) - 1 < index)
+		return folio_size(folio);
+	/* folio is wholly past EOF */
+	if (folio->index > index || !offset)
+		return -EFAULT;
+	/* folio is partially inside EOF */
+	return offset;
+}
+
 /**
  * page_mkwrite_check_truncate - check if page was truncated
  * @page: the page to check
-- 
cgit v1.2.3


From 9bf70167e3c61473b95f40771decc3778bf0fb9f Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 27 Apr 2021 16:37:09 -0400
Subject: mm/filemap: Add readahead_folio()

The pointers stored in the page cache are folios, by definition.
This change comes with a behaviour change -- callers of readahead_folio()
are no longer required to put the page reference themselves.  This matches
how readpage works, rather than matching how readpages used to work.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/pagemap.h | 54 +++++++++++++++++++++++++++++++++++--------------
 1 file changed, 39 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 4b74d83c5571..fea9615bab35 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -987,33 +987,57 @@ void page_cache_async_readahead(struct address_space *mapping,
 	page_cache_async_ra(&ractl, page, req_count);
 }
 
+static inline struct folio *__readahead_folio(struct readahead_control *ractl)
+{
+	struct folio *folio;
+
+	BUG_ON(ractl->_batch_count > ractl->_nr_pages);
+	ractl->_nr_pages -= ractl->_batch_count;
+	ractl->_index += ractl->_batch_count;
+
+	if (!ractl->_nr_pages) {
+		ractl->_batch_count = 0;
+		return NULL;
+	}
+
+	folio = xa_load(&ractl->mapping->i_pages, ractl->_index);
+	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+	ractl->_batch_count = folio_nr_pages(folio);
+
+	return folio;
+}
+
 /**
  * readahead_page - Get the next page to read.
- * @rac: The current readahead request.
+ * @ractl: The current readahead request.
  *
  * Context: The page is locked and has an elevated refcount.  The caller
  * should decreases the refcount once the page has been submitted for I/O
  * and unlock the page once all I/O to that page has completed.
  * Return: A pointer to the next page, or %NULL if we are done.
  */
-static inline struct page *readahead_page(struct readahead_control *rac)
+static inline struct page *readahead_page(struct readahead_control *ractl)
 {
-	struct page *page;
+	struct folio *folio = __readahead_folio(ractl);
 
-	BUG_ON(rac->_batch_count > rac->_nr_pages);
-	rac->_nr_pages -= rac->_batch_count;
-	rac->_index += rac->_batch_count;
-
-	if (!rac->_nr_pages) {
-		rac->_batch_count = 0;
-		return NULL;
-	}
+	return &folio->page;
+}
 
-	page = xa_load(&rac->mapping->i_pages, rac->_index);
-	VM_BUG_ON_PAGE(!PageLocked(page), page);
-	rac->_batch_count = thp_nr_pages(page);
+/**
+ * readahead_folio - Get the next folio to read.
+ * @ractl: The current readahead request.
+ *
+ * Context: The folio is locked.  The caller should unlock the folio once
+ * all I/O to that folio has completed.
+ * Return: A pointer to the next folio, or %NULL if we are done.
+ */
+static inline struct folio *readahead_folio(struct readahead_control *ractl)
+{
+	struct folio *folio = __readahead_folio(ractl);
 
-	return page;
+	if (folio)
+		folio_put(folio);
+	return folio;
 }
 
 static inline unsigned int __readahead_batch(struct readahead_control *rac,
-- 
cgit v1.2.3


From 0995d7e568141226f10f8216aa4965e06ab5db8a Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 29 Apr 2021 10:27:16 -0400
Subject: mm/workingset: Convert workingset_refault() to take a folio

This nets us 178 bytes of savings from removing calls to compound_head.
The three callers all grow a little, but each of them will be converted
to use folios soon, so that's fine.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/swap.h   | 4 ++--
 include/linux/vmstat.h | 6 ------
 2 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 067b7af5242c..892faac942da 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -329,7 +329,7 @@ static inline swp_entry_t folio_swap_entry(struct folio *folio)
 /* linux/mm/workingset.c */
 void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages);
 void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg);
-void workingset_refault(struct page *page, void *shadow);
+void workingset_refault(struct folio *folio, void *shadow);
 void workingset_activation(struct folio *folio);
 
 /* Only track the nodes of mappings with shadow entries */
@@ -350,7 +350,7 @@ extern unsigned long nr_free_buffer_pages(void);
 /* linux/mm/swap.c */
 extern void lru_note_cost(struct lruvec *lruvec, bool file,
 			  unsigned int nr_pages);
-extern void lru_note_cost_page(struct page *);
+extern void lru_note_cost_folio(struct folio *);
 extern void lru_cache_add(struct page *);
 void mark_page_accessed(struct page *);
 void folio_mark_accessed(struct folio *);
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 241bd0f53fb9..bfe38869498d 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -597,12 +597,6 @@ static inline void mod_lruvec_page_state(struct page *page,
 
 #endif /* CONFIG_MEMCG */
 
-static inline void inc_lruvec_state(struct lruvec *lruvec,
-				    enum node_stat_item idx)
-{
-	mod_lruvec_state(lruvec, idx, 1);
-}
-
 static inline void __inc_lruvec_page_state(struct page *page,
 					   enum node_stat_item idx)
 {
-- 
cgit v1.2.3


From 0d31125d2d3267ec349796418a16761bbda20387 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 29 Apr 2021 11:09:31 -0400
Subject: mm/lru: Add folio_add_lru()

Reimplement lru_cache_add() as a wrapper around folio_add_lru().
Saves 159 bytes of kernel text due to removing calls to compound_head().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/swap.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 892faac942da..cdf0957a88a4 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -351,6 +351,7 @@ extern unsigned long nr_free_buffer_pages(void);
 extern void lru_note_cost(struct lruvec *lruvec, bool file,
 			  unsigned int nr_pages);
 extern void lru_note_cost_folio(struct folio *);
+extern void folio_add_lru(struct folio *);
 extern void lru_cache_add(struct page *);
 void mark_page_accessed(struct page *);
 void folio_mark_accessed(struct folio *);
-- 
cgit v1.2.3


From cc09cb134124a42fbe3bdcebefdc54e286d8f3e5 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 15 Dec 2020 22:55:54 -0500
Subject: mm/page_alloc: Add folio allocation functions

The __folio_alloc(), __folio_alloc_node() and folio_alloc() functions
are mostly for type safety, but they also ensure that the page allocator
allocates a compound page and initialises the deferred list if the page
is large enough to have one.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/gfp.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index dc5ff40608ce..3745efd21cf6 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -523,6 +523,8 @@ static inline void arch_alloc_page(struct page *page, int order) { }
 
 struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
 		nodemask_t *nodemask);
+struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid,
+		nodemask_t *nodemask);
 
 unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 				nodemask_t *nodemask, int nr_pages,
@@ -564,6 +566,15 @@ __alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
 	return __alloc_pages(gfp_mask, order, nid, NULL);
 }
 
+static inline
+struct folio *__folio_alloc_node(gfp_t gfp, unsigned int order, int nid)
+{
+	VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
+	VM_WARN_ON((gfp & __GFP_THISNODE) && !node_online(nid));
+
+	return __folio_alloc(gfp, order, nid, NULL);
+}
+
 /*
  * Allocate pages, preferring the node given as nid. When nid == NUMA_NO_NODE,
  * prefer the current CPU's closest node. Otherwise node must be valid and
@@ -580,6 +591,7 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
 
 #ifdef CONFIG_NUMA
 struct page *alloc_pages(gfp_t gfp, unsigned int order);
+struct folio *folio_alloc(gfp_t gfp, unsigned order);
 extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
 			struct vm_area_struct *vma, unsigned long addr,
 			int node, bool hugepage);
@@ -590,6 +602,10 @@ static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)
 {
 	return alloc_pages_node(numa_node_id(), gfp_mask, order);
 }
+static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order)
+{
+	return __folio_alloc_node(gfp, order, numa_node_id());
+}
 #define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\
 	alloc_pages(gfp_mask, order)
 #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
-- 
cgit v1.2.3


From bb3c579e25e5757bc5bac1333f4a56dfebf7cb91 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 15 Dec 2020 23:11:07 -0500
Subject: mm/filemap: Add filemap_alloc_folio

Reimplement __page_cache_alloc as a wrapper around filemap_alloc_folio
to allow filesystems to be converted at our leisure.  Increases
kernel text size by 133 bytes, mostly in cachefiles_read_backing_file().
pagecache_get_page() shrinks by 32 bytes, though.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/pagemap.h | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index fea9615bab35..51c190ae3b0b 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -262,14 +262,19 @@ static inline void *detach_page_private(struct page *page)
 }
 
 #ifdef CONFIG_NUMA
-extern struct page *__page_cache_alloc(gfp_t gfp);
+struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order);
 #else
-static inline struct page *__page_cache_alloc(gfp_t gfp)
+static inline struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order)
 {
-	return alloc_pages(gfp, 0);
+	return folio_alloc(gfp, order);
 }
 #endif
 
+static inline struct page *__page_cache_alloc(gfp_t gfp)
+{
+	return &filemap_alloc_folio(gfp, 0)->page;
+}
+
 static inline struct page *page_cache_alloc(struct address_space *x)
 {
 	return __page_cache_alloc(mapping_gfp_mask(x));
-- 
cgit v1.2.3


From 9dd3d069406cea073fc633e77bc59abbfde8c6c4 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 8 Dec 2020 08:56:28 -0500
Subject: mm/filemap: Add filemap_add_folio()

Convert __add_to_page_cache_locked() into __filemap_add_folio().
Add an assertion to it that (for !hugetlbfs), the folio is naturally
aligned within the file.  Move the prototype from mm.h to pagemap.h.
Convert add_to_page_cache_lru() into filemap_add_folio().  Add a
compatibility wrapper for unconverted callers.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/mm.h      |  7 -------
 include/linux/pagemap.h | 10 ++++++++--
 2 files changed, 8 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 23ab040aa65a..efcb06fc6116 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -213,13 +213,6 @@ int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *,
 		loff_t *);
 int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *,
 		loff_t *);
-/*
- * Any attempt to mark this function as static leads to build failure
- * when CONFIG_DEBUG_INFO_BTF is enabled because __add_to_page_cache_locked()
- * is referred to by BPF code. This must be visible for error injection.
- */
-int __add_to_page_cache_locked(struct page *page, struct address_space *mapping,
-		pgoff_t index, gfp_t gfp, void **shadowp);
 
 #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
 #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 51c190ae3b0b..a10eb6dc3866 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -876,9 +876,11 @@ static inline int fault_in_pages_readable(const char __user *uaddr, size_t size)
 }
 
 int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
-				pgoff_t index, gfp_t gfp_mask);
+		pgoff_t index, gfp_t gfp);
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
-				pgoff_t index, gfp_t gfp_mask);
+		pgoff_t index, gfp_t gfp);
+int filemap_add_folio(struct address_space *mapping, struct folio *folio,
+		pgoff_t index, gfp_t gfp);
 extern void delete_from_page_cache(struct page *page);
 extern void __delete_from_page_cache(struct page *page, void *shadow);
 void replace_page_cache_page(struct page *old, struct page *new);
@@ -903,6 +905,10 @@ static inline int add_to_page_cache(struct page *page,
 	return error;
 }
 
+/* Must be non-static for BPF error injection */
+int __filemap_add_folio(struct address_space *mapping, struct folio *folio,
+		pgoff_t index, gfp_t gfp, void **shadowp);
+
 /**
  * struct readahead_control - Describes a readahead request.
  *
-- 
cgit v1.2.3


From 3f0c6a07fee6a1c2618a2e12ffb8e9aa24384fde Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 8 Mar 2021 11:45:35 -0500
Subject: mm/filemap: Add filemap_get_folio

filemap_get_folio() is a replacement for find_get_page().
Turn pagecache_get_page() into a wrapper around __filemap_get_folio().
Remove find_lock_head() as this use case is now covered by
filemap_get_folio().

Reduces overall kernel size by 209 bytes.  __filemap_get_folio() is
316 bytes shorter than pagecache_get_page() was, but the new
pagecache_get_page() wrapper is 99 bytes.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/pagemap.h | 41 ++++++++++++++++++++---------------------
 1 file changed, 20 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index a10eb6dc3866..401a90336a2c 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -302,8 +302,26 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
 #define FGP_HEAD		0x00000080
 #define FGP_ENTRY		0x00000100
 
-struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
-		int fgp_flags, gfp_t cache_gfp_mask);
+struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
+		int fgp_flags, gfp_t gfp);
+struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
+		int fgp_flags, gfp_t gfp);
+
+/**
+ * filemap_get_folio - Find and get a folio.
+ * @mapping: The address_space to search.
+ * @index: The page index.
+ *
+ * Looks up the page cache entry at @mapping & @index.  If a folio is
+ * present, it is returned with an increased refcount.
+ *
+ * Otherwise, %NULL is returned.
+ */
+static inline struct folio *filemap_get_folio(struct address_space *mapping,
+					pgoff_t index)
+{
+	return __filemap_get_folio(mapping, index, 0, 0);
+}
 
 /**
  * find_get_page - find and get a page reference
@@ -346,25 +364,6 @@ static inline struct page *find_lock_page(struct address_space *mapping,
 	return pagecache_get_page(mapping, index, FGP_LOCK, 0);
 }
 
-/**
- * find_lock_head - Locate, pin and lock a pagecache page.
- * @mapping: The address_space to search.
- * @index: The page index.
- *
- * Looks up the page cache entry at @mapping & @index.  If there is a
- * page cache page, its head page is returned locked and with an increased
- * refcount.
- *
- * Context: May sleep.
- * Return: A struct page which is !PageTail, or %NULL if there is no page
- * in the cache for this index.
- */
-static inline struct page *find_lock_head(struct address_space *mapping,
-					pgoff_t index)
-{
-	return pagecache_get_page(mapping, index, FGP_LOCK | FGP_HEAD, 0);
-}
-
 /**
  * find_or_create_page - locate or add a pagecache page
  * @mapping: the page's address_space
-- 
cgit v1.2.3


From b27652d935f41793c5e229a1e8b3a8bb3afe3cc1 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 24 Dec 2020 12:55:56 -0500
Subject: mm/filemap: Add FGP_STABLE

Allow filemap_get_folio() to wait for writeback to complete (if the
filesystem wants that behaviour).  This is the folio equivalent of
grab_cache_page_write_begin(), which is moved into the folio-compat
file as a reminder to migrate all the code using it.  This paves the
way for getting rid of AOP_FLAG_NOFS once grab_cache_page_write_begin()
is removed.

Kernel grows by 11 bytes.  filemap_get_folio() grows by 33 bytes but
grab_cache_page_write_begin() shrinks by 22 bytes to make up for it.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/pagemap.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 401a90336a2c..b68b053e581f 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -301,6 +301,7 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
 #define FGP_FOR_MMAP		0x00000040
 #define FGP_HEAD		0x00000080
 #define FGP_ENTRY		0x00000100
+#define FGP_STABLE		0x00000200
 
 struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
 		int fgp_flags, gfp_t gfp);
-- 
cgit v1.2.3


From 121703c1c817b3c77f61002466d0bfca7e39f25d Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 9 Mar 2021 13:48:03 -0500
Subject: mm/writeback: Add folio_write_one

Transform write_one_page() into folio_write_one() and add a compatibility
wrapper.  Also move the declaration to pagemap.h as this is page cache
functionality that doesn't need to be used by the rest of the kernel.

Saves 58 bytes of kernel text.  While folio_write_one() is 101 bytes
smaller than write_one_page(), the inlined call to page_folio() expands
each caller.  There are fewer than ten callers so it doesn't seem worth
putting a wrapper in the core.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: David Howells <dhowells@redhat.com>
---
 include/linux/mm.h      | 4 ----
 include/linux/pagemap.h | 5 +++++
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index efcb06fc6116..40ff114aaf9e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2788,10 +2788,6 @@ extern vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 		pgoff_t start_pgoff, pgoff_t end_pgoff);
 extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf);
 
-/* mm/page-writeback.c */
-int __must_check write_one_page(struct page *page);
-void task_dirty_inc(struct task_struct *tsk);
-
 extern unsigned long stack_guard_gap;
 /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
 extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index b68b053e581f..013cdc90f5fd 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -803,6 +803,11 @@ static inline void cancel_dirty_page(struct page *page)
 }
 bool folio_clear_dirty_for_io(struct folio *folio);
 bool clear_page_dirty_for_io(struct page *page);
+int __must_check folio_write_one(struct folio *folio);
+static inline int __must_check write_one_page(struct page *page)
+{
+	return folio_write_one(page_folio(page));
+}
 
 int __set_page_dirty_nobuffers(struct page *page);
 int __set_page_dirty_no_writeback(struct page *page);
-- 
cgit v1.2.3


From f2efdb17928924c9c935c136dea764a081032006 Mon Sep 17 00:00:00 2001
From: "Ahmed S. Darwish" <a.darwish@linutronix.de>
Date: Sat, 16 Oct 2021 10:49:06 +0200
Subject: u64_stats: Introduce u64_stats_set()

Allow to directly set a u64_stats_t value which is used to provide an init
function which sets it directly to zero intead of memset() the value.

Add u64_stats_set() to the u64_stats API.

[bigeasy: commit message. ]

Signed-off-by: Ahmed S. Darwish <a.darwish@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/u64_stats_sync.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h
index e81856c0ba13..e8ec116c916b 100644
--- a/include/linux/u64_stats_sync.h
+++ b/include/linux/u64_stats_sync.h
@@ -83,6 +83,11 @@ static inline u64 u64_stats_read(const u64_stats_t *p)
 	return local64_read(&p->v);
 }
 
+static inline void u64_stats_set(u64_stats_t *p, u64 val)
+{
+	local64_set(&p->v, val);
+}
+
 static inline void u64_stats_add(u64_stats_t *p, unsigned long val)
 {
 	local64_add(val, &p->v);
@@ -104,6 +109,11 @@ static inline u64 u64_stats_read(const u64_stats_t *p)
 	return p->v;
 }
 
+static inline void u64_stats_set(u64_stats_t *p, u64 val)
+{
+	p->v = val;
+}
+
 static inline void u64_stats_add(u64_stats_t *p, unsigned long val)
 {
 	p->v += val;
-- 
cgit v1.2.3


From 29cbcd85828372333aa87542c51f2b2b0fd4380c Mon Sep 17 00:00:00 2001
From: "Ahmed S. Darwish" <a.darwish@linutronix.de>
Date: Sat, 16 Oct 2021 10:49:10 +0200
Subject: net: sched: Remove Qdisc::running sequence counter

The Qdisc::running sequence counter has two uses:

  1. Reliably reading qdisc's tc statistics while the qdisc is running
     (a seqcount read/retry loop at gnet_stats_add_basic()).

  2. As a flag, indicating whether the qdisc in question is running
     (without any retry loops).

For the first usage, the Qdisc::running sequence counter write section,
qdisc_run_begin() => qdisc_run_end(), covers a much wider area than what
is actually needed: the raw qdisc's bstats update. A u64_stats sync
point was thus introduced (in previous commits) inside the bstats
structure itself. A local u64_stats write section is then started and
stopped for the bstats updates.

Use that u64_stats sync point mechanism for the bstats read/retry loop
at gnet_stats_add_basic().

For the second qdisc->running usage, a __QDISC_STATE_RUNNING bit flag,
accessed with atomic bitops, is sufficient. Using a bit flag instead of
a sequence counter at qdisc_run_begin/end() and qdisc_is_running() leads
to the SMP barriers implicitly added through raw_read_seqcount() and
write_seqcount_begin/end() getting removed. All call sites have been
surveyed though, and no required ordering was identified.

Now that the qdisc->running sequence counter is no longer used, remove
it.

Note, using u64_stats implies no sequence counter protection for 64-bit
architectures. This can lead to the qdisc tc statistics "packets" vs.
"bytes" values getting out of sync on rare occasions. The individual
values will still be valid.

Signed-off-by: Ahmed S. Darwish <a.darwish@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 173984414f38..f9cd6fea213f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1916,7 +1916,6 @@ enum netdev_ml_priv_type {
  *	@sfp_bus:	attached &struct sfp_bus structure.
  *
  *	@qdisc_tx_busylock: lockdep class annotating Qdisc->busylock spinlock
- *	@qdisc_running_key: lockdep class annotating Qdisc->running seqcount
  *
  *	@proto_down:	protocol port state information can be sent to the
  *			switch driver and used to set the phys state of the
@@ -2250,7 +2249,6 @@ struct net_device {
 	struct phy_device	*phydev;
 	struct sfp_bus		*sfp_bus;
 	struct lock_class_key	*qdisc_tx_busylock;
-	struct lock_class_key	*qdisc_running_key;
 	bool			proto_down;
 	unsigned		wol_enabled:1;
 	unsigned		threaded:1;
@@ -2360,13 +2358,11 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev,
 #define netdev_lockdep_set_classes(dev)				\
 {								\
 	static struct lock_class_key qdisc_tx_busylock_key;	\
-	static struct lock_class_key qdisc_running_key;		\
 	static struct lock_class_key qdisc_xmit_lock_key;	\
 	static struct lock_class_key dev_addr_list_lock_key;	\
 	unsigned int i;						\
 								\
 	(dev)->qdisc_tx_busylock = &qdisc_tx_busylock_key;	\
-	(dev)->qdisc_running_key = &qdisc_running_key;		\
 	lockdep_set_class(&(dev)->addr_list_lock,		\
 			  &dev_addr_list_lock_key);		\
 	for (i = 0; i < (dev)->num_tx_queues; i++)		\
-- 
cgit v1.2.3


From 82a8daaecfd9382e9450a05f86be8a274cf69a27 Mon Sep 17 00:00:00 2001
From: Marc Bonnici <marc.bonnici@arm.com>
Date: Fri, 15 Oct 2021 17:57:42 +0100
Subject: firmware: arm_ffa: Add support for MEM_LEND

As part of the FF-A spec, an endpoint is allowed to transfer access of,
or lend, a memory region to one or more borrowers.

Extend the existing memory sharing implementation to support
FF-A MEM_LEND functionality and expose this to other kernel drivers.

Note that upon a successful MEM_LEND request the caller must ensure that
the memory region specified is not accessed until a successful
MEM_RECALIM call has been made. On systems with a hypervisor present
this will been enforced, however on systems without a hypervisor the
responsibility falls to the calling kernel driver to prevent access.

Link: https://lore.kernel.org/r/20211015165742.2513065-1-marc.bonnici@arm.com
Reviewed-by: Jens Wiklander <jens.wiklander@linaro.org>
Signed-off-by: Marc Bonnici <marc.bonnici@arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 include/linux/arm_ffa.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h
index 505c679b6a9b..85651e41ded8 100644
--- a/include/linux/arm_ffa.h
+++ b/include/linux/arm_ffa.h
@@ -262,6 +262,8 @@ struct ffa_dev_ops {
 	int (*memory_reclaim)(u64 g_handle, u32 flags);
 	int (*memory_share)(struct ffa_device *dev,
 			    struct ffa_mem_ops_args *args);
+	int (*memory_lend)(struct ffa_device *dev,
+			   struct ffa_mem_ops_args *args);
 };
 
 #endif /* _LINUX_ARM_FFA_H */
-- 
cgit v1.2.3


From 348332e000697b4ca82ef96719e02876434b8346 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 20 Sep 2021 14:33:12 +0200
Subject: mm: don't include <linux/blk-cgroup.h> in <linux/writeback.h>

blk-cgroup.h pulls in blkdev.h and thus pretty much all the block
headers.  Break this dependency chain by turning wbc_blkcg_css into a
macro and dropping the blk-cgroup.h include.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20210920123328.1399408-2-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/writeback.h | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index d1f65adf6a26..8eb165760752 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -11,7 +11,6 @@
 #include <linux/flex_proportions.h>
 #include <linux/backing-dev-defs.h>
 #include <linux/blk_types.h>
-#include <linux/blk-cgroup.h>
 
 struct bio;
 
@@ -109,15 +108,12 @@ static inline int wbc_to_write_flags(struct writeback_control *wbc)
 	return flags;
 }
 
-static inline struct cgroup_subsys_state *
-wbc_blkcg_css(struct writeback_control *wbc)
-{
 #ifdef CONFIG_CGROUP_WRITEBACK
-	if (wbc->wb)
-		return wbc->wb->blkcg_css;
-#endif
-	return blkcg_root_css;
-}
+#define wbc_blkcg_css(wbc) \
+	((wbc)->wb ? (wbc)->wb->blkcg_css : blkcg_root_css)
+#else
+#define wbc_blkcg_css(wbc)		(blkcg_root_css)
+#endif /* CONFIG_CGROUP_WRITEBACK */
 
 /*
  * A wb_domain represents a domain that wb's (bdi_writeback's) belong to
-- 
cgit v1.2.3


From e41d12f539f7c8bac9b0897031fee6cc9158a6be Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 20 Sep 2021 14:33:13 +0200
Subject: mm: don't include <linux/blk-cgroup.h> in <linux/backing-dev.h>

There is no need to pull blk-cgroup.h and thus blkdev.h in here, so
break the include chain.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20210920123328.1399408-3-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/backing-dev.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index ac7f231b8825..843bf4be675f 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -15,10 +15,11 @@
 #include <linux/blkdev.h>
 #include <linux/device.h>
 #include <linux/writeback.h>
-#include <linux/blk-cgroup.h>
 #include <linux/backing-dev-defs.h>
 #include <linux/slab.h>
 
+struct blkcg;
+
 static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi)
 {
 	kref_get(&bdi->refcnt);
-- 
cgit v1.2.3


From ccdf774189b6466457ca9c7de1fe9ed18547d249 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 20 Sep 2021 14:33:14 +0200
Subject: mm: don't include <linux/blkdev.h> in <linux/backing-dev.h>

Move inode_to_bdi out of line to avoid having to include blkdev.h.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20210920123328.1399408-4-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/backing-dev.h | 16 +---------------
 1 file changed, 1 insertion(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 843bf4be675f..4ac7ce096013 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -12,7 +12,6 @@
 #include <linux/kernel.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
-#include <linux/blkdev.h>
 #include <linux/device.h>
 #include <linux/writeback.h>
 #include <linux/backing-dev-defs.h>
@@ -134,20 +133,7 @@ static inline bool writeback_in_progress(struct bdi_writeback *wb)
 	return test_bit(WB_writeback_running, &wb->state);
 }
 
-static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
-{
-	struct super_block *sb;
-
-	if (!inode)
-		return &noop_backing_dev_info;
-
-	sb = inode->i_sb;
-#ifdef CONFIG_BLOCK
-	if (sb_is_blkdev_sb(sb))
-		return I_BDEV(inode)->bd_disk->bdi;
-#endif
-	return sb->s_bdi;
-}
+struct backing_dev_info *inode_to_bdi(struct inode *inode);
 
 static inline int wb_congested(struct bdi_writeback *wb, int cong_bits)
 {
-- 
cgit v1.2.3


From 1d9433cdd04adf7cfd5e3ef81f5acb29d80aae9b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 20 Sep 2021 14:33:19 +0200
Subject: block: remove the unused rq_end_sector macro

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20210920123328.1399408-9-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/elevator.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index ef9ceead3db1..80633f3b600c 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -162,7 +162,6 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t);
 #define ELEVATOR_INSERT_FLUSH	5
 #define ELEVATOR_INSERT_SORT_MERGE	6
 
-#define rq_end_sector(rq)	(blk_rq_pos(rq) + blk_rq_sectors(rq))
 #define rb_entry_rq(node)	rb_entry((node), struct request, rb_node)
 
 #define rq_entry_fifo(ptr)	list_entry((ptr), struct request, queuelist)
-- 
cgit v1.2.3


From 90138237a56282f99d71130ab7c68903a2eba5a7 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 20 Sep 2021 14:33:20 +0200
Subject: block: remove the unused blk_queue_state enum

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20210920123328.1399408-10-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 12b9dbcc980e..9e367509bea1 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -262,11 +262,6 @@ enum blk_eh_timer_return {
 	BLK_EH_RESET_TIMER,	/* reset timer and try again */
 };
 
-enum blk_queue_state {
-	Queue_down,
-	Queue_up,
-};
-
 #define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */
 #define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */
 
-- 
cgit v1.2.3


From 713e4e11088875bf7aee3df81b167c8b2f6c5d65 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 20 Sep 2021 14:33:21 +0200
Subject: block: remove the cmd_size field from struct request_queue

Entirely unused.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20210920123328.1399408-11-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9e367509bea1..6737e484280d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -544,8 +544,6 @@ struct request_queue {
 
 	bool			mq_sysfs_init_done;
 
-	size_t			cmd_size;
-
 #define BLK_MAX_WRITE_HINTS	5
 	u64			write_hints[BLK_MAX_WRITE_HINTS];
 };
-- 
cgit v1.2.3


From 9778ac77c2027827ffdbb33d3e936b3a0ae9f0f9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 20 Sep 2021 14:33:22 +0200
Subject: block: remove the struct blk_queue_ctx forward declaration

This type doesn't exist at all, so no need to forward declare it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20210920123328.1399408-12-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6737e484280d..4bcbb1ae2d66 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -253,8 +253,6 @@ static inline unsigned short req_get_ioprio(struct request *req)
 
 #include <linux/elevator.h>
 
-struct blk_queue_ctx;
-
 struct bio_vec;
 
 enum blk_eh_timer_return {
-- 
cgit v1.2.3


From 2e9bc3465ac54d282b855b073409c2c3a7d1ae00 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 20 Sep 2021 14:33:23 +0200
Subject: block: move elevator.h to block/

Except for the features passed to blk_queue_required_elevator_features,
elevator.h is only needed internally to the block layer.  Move the
ELEVATOR_F_* definitions to blkdev.h, and the move elevator.h to
block/, dropping all the spurious includes outside of that.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20210920123328.1399408-13-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h   |  11 ++-
 include/linux/elevator.h | 180 -----------------------------------------------
 2 files changed, 9 insertions(+), 182 deletions(-)
 delete mode 100644 include/linux/elevator.h

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4bcbb1ae2d66..d815238f61ed 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -251,8 +251,6 @@ static inline unsigned short req_get_ioprio(struct request *req)
 	return req->ioprio;
 }
 
-#include <linux/elevator.h>
-
 struct bio_vec;
 
 enum blk_eh_timer_return {
@@ -1124,6 +1122,15 @@ extern void blk_queue_dma_alignment(struct request_queue *, int);
 extern void blk_queue_update_dma_alignment(struct request_queue *, int);
 extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
 extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua);
+
+/*
+ * Elevator features for blk_queue_required_elevator_features:
+ */
+/* Supports zoned block devices sequential write constraint */
+#define ELEVATOR_F_ZBD_SEQ_WRITE	(1U << 0)
+/* Supports scheduling on multiple hardware queues */
+#define ELEVATOR_F_MQ_AWARE		(1U << 1)
+
 extern void blk_queue_required_elevator_features(struct request_queue *q,
 						 unsigned int features);
 extern bool blk_queue_can_use_dma_map_merging(struct request_queue *q,
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
deleted file mode 100644
index 80633f3b600c..000000000000
--- a/include/linux/elevator.h
+++ /dev/null
@@ -1,180 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_ELEVATOR_H
-#define _LINUX_ELEVATOR_H
-
-#include <linux/percpu.h>
-#include <linux/hashtable.h>
-
-#ifdef CONFIG_BLOCK
-
-struct io_cq;
-struct elevator_type;
-#ifdef CONFIG_BLK_DEBUG_FS
-struct blk_mq_debugfs_attr;
-#endif
-
-/*
- * Return values from elevator merger
- */
-enum elv_merge {
-	ELEVATOR_NO_MERGE	= 0,
-	ELEVATOR_FRONT_MERGE	= 1,
-	ELEVATOR_BACK_MERGE	= 2,
-	ELEVATOR_DISCARD_MERGE	= 3,
-};
-
-struct blk_mq_alloc_data;
-struct blk_mq_hw_ctx;
-
-struct elevator_mq_ops {
-	int (*init_sched)(struct request_queue *, struct elevator_type *);
-	void (*exit_sched)(struct elevator_queue *);
-	int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int);
-	void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int);
-	void (*depth_updated)(struct blk_mq_hw_ctx *);
-
-	bool (*allow_merge)(struct request_queue *, struct request *, struct bio *);
-	bool (*bio_merge)(struct request_queue *, struct bio *, unsigned int);
-	int (*request_merge)(struct request_queue *q, struct request **, struct bio *);
-	void (*request_merged)(struct request_queue *, struct request *, enum elv_merge);
-	void (*requests_merged)(struct request_queue *, struct request *, struct request *);
-	void (*limit_depth)(unsigned int, struct blk_mq_alloc_data *);
-	void (*prepare_request)(struct request *);
-	void (*finish_request)(struct request *);
-	void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool);
-	struct request *(*dispatch_request)(struct blk_mq_hw_ctx *);
-	bool (*has_work)(struct blk_mq_hw_ctx *);
-	void (*completed_request)(struct request *, u64);
-	void (*requeue_request)(struct request *);
-	struct request *(*former_request)(struct request_queue *, struct request *);
-	struct request *(*next_request)(struct request_queue *, struct request *);
-	void (*init_icq)(struct io_cq *);
-	void (*exit_icq)(struct io_cq *);
-};
-
-#define ELV_NAME_MAX	(16)
-
-struct elv_fs_entry {
-	struct attribute attr;
-	ssize_t (*show)(struct elevator_queue *, char *);
-	ssize_t (*store)(struct elevator_queue *, const char *, size_t);
-};
-
-/*
- * identifies an elevator type, such as AS or deadline
- */
-struct elevator_type
-{
-	/* managed by elevator core */
-	struct kmem_cache *icq_cache;
-
-	/* fields provided by elevator implementation */
-	struct elevator_mq_ops ops;
-
-	size_t icq_size;	/* see iocontext.h */
-	size_t icq_align;	/* ditto */
-	struct elv_fs_entry *elevator_attrs;
-	const char *elevator_name;
-	const char *elevator_alias;
-	const unsigned int elevator_features;
-	struct module *elevator_owner;
-#ifdef CONFIG_BLK_DEBUG_FS
-	const struct blk_mq_debugfs_attr *queue_debugfs_attrs;
-	const struct blk_mq_debugfs_attr *hctx_debugfs_attrs;
-#endif
-
-	/* managed by elevator core */
-	char icq_cache_name[ELV_NAME_MAX + 6];	/* elvname + "_io_cq" */
-	struct list_head list;
-};
-
-#define ELV_HASH_BITS 6
-
-void elv_rqhash_del(struct request_queue *q, struct request *rq);
-void elv_rqhash_add(struct request_queue *q, struct request *rq);
-void elv_rqhash_reposition(struct request_queue *q, struct request *rq);
-struct request *elv_rqhash_find(struct request_queue *q, sector_t offset);
-
-/*
- * each queue has an elevator_queue associated with it
- */
-struct elevator_queue
-{
-	struct elevator_type *type;
-	void *elevator_data;
-	struct kobject kobj;
-	struct mutex sysfs_lock;
-	unsigned int registered:1;
-	DECLARE_HASHTABLE(hash, ELV_HASH_BITS);
-};
-
-/*
- * block elevator interface
- */
-extern enum elv_merge elv_merge(struct request_queue *, struct request **,
-		struct bio *);
-extern void elv_merge_requests(struct request_queue *, struct request *,
-			       struct request *);
-extern void elv_merged_request(struct request_queue *, struct request *,
-		enum elv_merge);
-extern bool elv_attempt_insert_merge(struct request_queue *, struct request *,
-				     struct list_head *);
-extern struct request *elv_former_request(struct request_queue *, struct request *);
-extern struct request *elv_latter_request(struct request_queue *, struct request *);
-void elevator_init_mq(struct request_queue *q);
-
-/*
- * io scheduler registration
- */
-extern int elv_register(struct elevator_type *);
-extern void elv_unregister(struct elevator_type *);
-
-/*
- * io scheduler sysfs switching
- */
-extern ssize_t elv_iosched_show(struct request_queue *, char *);
-extern ssize_t elv_iosched_store(struct request_queue *, const char *, size_t);
-
-extern bool elv_bio_merge_ok(struct request *, struct bio *);
-extern struct elevator_queue *elevator_alloc(struct request_queue *,
-					struct elevator_type *);
-
-/*
- * Helper functions.
- */
-extern struct request *elv_rb_former_request(struct request_queue *, struct request *);
-extern struct request *elv_rb_latter_request(struct request_queue *, struct request *);
-
-/*
- * rb support functions.
- */
-extern void elv_rb_add(struct rb_root *, struct request *);
-extern void elv_rb_del(struct rb_root *, struct request *);
-extern struct request *elv_rb_find(struct rb_root *, sector_t);
-
-/*
- * Insertion selection
- */
-#define ELEVATOR_INSERT_FRONT	1
-#define ELEVATOR_INSERT_BACK	2
-#define ELEVATOR_INSERT_SORT	3
-#define ELEVATOR_INSERT_REQUEUE	4
-#define ELEVATOR_INSERT_FLUSH	5
-#define ELEVATOR_INSERT_SORT_MERGE	6
-
-#define rb_entry_rq(node)	rb_entry((node), struct request, rb_node)
-
-#define rq_entry_fifo(ptr)	list_entry((ptr), struct request, queuelist)
-#define rq_fifo_clear(rq)	list_del_init(&(rq)->queuelist)
-
-/*
- * Elevator features.
- */
-
-/* Supports zoned block devices sequential write constraint */
-#define ELEVATOR_F_ZBD_SEQ_WRITE	(1U << 0)
-/* Supports scheduling on multiple hardware queues */
-#define ELEVATOR_F_MQ_AWARE		(1U << 1)
-
-#endif /* CONFIG_BLOCK */
-#endif
-- 
cgit v1.2.3


From 3ab0bc78e96bd03a5096e4801550926d81b3e19d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 20 Sep 2021 14:33:24 +0200
Subject: block: drop unused includes in <linux/blkdev.h>

Drop various include not actually used in blkdev.h itself.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20210920123328.1399408-14-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d815238f61ed..46a703394f7f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -3,8 +3,6 @@
 #define _LINUX_BLKDEV_H
 
 #include <linux/sched.h>
-#include <linux/sched/clock.h>
-#include <linux/major.h>
 #include <linux/genhd.h>
 #include <linux/list.h>
 #include <linux/llist.h>
@@ -12,17 +10,12 @@
 #include <linux/timer.h>
 #include <linux/workqueue.h>
 #include <linux/wait.h>
-#include <linux/mempool.h>
-#include <linux/pfn.h>
 #include <linux/bio.h>
-#include <linux/stringify.h>
 #include <linux/gfp.h>
-#include <linux/smp.h>
 #include <linux/rcupdate.h>
 #include <linux/percpu-refcount.h>
 #include <linux/scatterlist.h>
 #include <linux/blkzoned.h>
-#include <linux/pm.h>
 #include <linux/sbitmap.h>
 
 struct module;
-- 
cgit v1.2.3


From b81e0c2372e65e5627864ba034433b64b2fc73f5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 20 Sep 2021 14:33:25 +0200
Subject: block: drop unused includes in <linux/genhd.h>

Drop various include not actually used in genhd.h itself, and
move the remaning includes closer together.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20210920123328.1399408-15-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/genhd.h     | 14 ++------------
 include/linux/part_stat.h |  1 +
 2 files changed, 3 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 0f5315c2b5a3..0b48a0cf4262 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -12,12 +12,10 @@
 
 #include <linux/types.h>
 #include <linux/kdev_t.h>
-#include <linux/rcupdate.h>
-#include <linux/slab.h>
-#include <linux/percpu-refcount.h>
 #include <linux/uuid.h>
 #include <linux/blk_types.h>
-#include <asm/local.h>
+#include <linux/device.h>
+#include <linux/xarray.h>
 
 extern const struct device_type disk_type;
 extern struct device_type part_type;
@@ -26,14 +24,6 @@ extern struct class block_class;
 #define DISK_MAX_PARTS			256
 #define DISK_NAME_LEN			32
 
-#include <linux/major.h>
-#include <linux/device.h>
-#include <linux/smp.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/workqueue.h>
-#include <linux/xarray.h>
-
 #define PARTITION_META_INFO_VOLNAMELTH	64
 /*
  * Enough for the string representation of any kind of UUID plus NULL.
diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h
index d2558121d48c..6f7949b2fd8d 100644
--- a/include/linux/part_stat.h
+++ b/include/linux/part_stat.h
@@ -3,6 +3,7 @@
 #define _LINUX_PART_STAT_H
 
 #include <linux/genhd.h>
+#include <asm/local.h>
 
 struct disk_stats {
 	u64 nsecs[NR_STAT_GROUPS];
-- 
cgit v1.2.3


From badf7f64378796d460c79eb0f182fa7282eb65d5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 20 Sep 2021 14:33:26 +0200
Subject: block: move a few merge helpers out of <linux/blkdev.h>

These are block-layer internal helpers, so move them to block/blk.h and
block/blk-merge.c.  Also update a comment a bit to use better grammar.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20210920123328.1399408-16-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 64 --------------------------------------------------
 1 file changed, 64 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 46a703394f7f..be534040ca9c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -745,37 +745,6 @@ static inline bool rq_is_sync(struct request *rq)
 	return op_is_sync(rq->cmd_flags);
 }
 
-static inline bool rq_mergeable(struct request *rq)
-{
-	if (blk_rq_is_passthrough(rq))
-		return false;
-
-	if (req_op(rq) == REQ_OP_FLUSH)
-		return false;
-
-	if (req_op(rq) == REQ_OP_WRITE_ZEROES)
-		return false;
-
-	if (req_op(rq) == REQ_OP_ZONE_APPEND)
-		return false;
-
-	if (rq->cmd_flags & REQ_NOMERGE_FLAGS)
-		return false;
-	if (rq->rq_flags & RQF_NOMERGE_FLAGS)
-		return false;
-
-	return true;
-}
-
-static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b)
-{
-	if (bio_page(a) == bio_page(b) &&
-	    bio_offset(a) == bio_offset(b))
-		return true;
-
-	return false;
-}
-
 static inline unsigned int blk_queue_depth(struct request_queue *q)
 {
 	if (q->queue_depth)
@@ -1030,23 +999,6 @@ static inline unsigned int blk_max_size_offset(struct request_queue *q,
 	return min(q->limits.max_sectors, chunk_sectors);
 }
 
-static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
-						  sector_t offset)
-{
-	struct request_queue *q = rq->q;
-
-	if (blk_rq_is_passthrough(rq))
-		return q->limits.max_hw_sectors;
-
-	if (!q->limits.chunk_sectors ||
-	    req_op(rq) == REQ_OP_DISCARD ||
-	    req_op(rq) == REQ_OP_SECURE_ERASE)
-		return blk_queue_get_max_sectors(q, req_op(rq));
-
-	return min(blk_max_size_offset(q, offset, 0),
-			blk_queue_get_max_sectors(q, req_op(rq)));
-}
-
 static inline unsigned int blk_rq_count_bios(struct request *rq)
 {
 	unsigned int nr_bios = 0;
@@ -1490,22 +1442,6 @@ static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector
 	return offset << SECTOR_SHIFT;
 }
 
-/*
- * Two cases of handling DISCARD merge:
- * If max_discard_segments > 1, the driver takes every bio
- * as a range and send them to controller together. The ranges
- * needn't to be contiguous.
- * Otherwise, the bios/requests will be handled as same as
- * others which should be contiguous.
- */
-static inline bool blk_discard_mergable(struct request *req)
-{
-	if (req_op(req) == REQ_OP_DISCARD &&
-	    queue_max_discard_segments(req->q) > 1)
-		return true;
-	return false;
-}
-
 static inline int bdev_discard_alignment(struct block_device *bdev)
 {
 	struct request_queue *q = bdev_get_queue(bdev);
-- 
cgit v1.2.3


From fe45e630a1035aea94c29016f2598bbde149bbe3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 20 Sep 2021 14:33:27 +0200
Subject: block: move integrity handling out of <linux/blkdev.h>

Split the integrity/metadata handling definitions out into a new header.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20210920123328.1399408-17-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-integrity.h | 183 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/blkdev.h        | 183 ------------------------------------------
 2 files changed, 183 insertions(+), 183 deletions(-)
 create mode 100644 include/linux/blk-integrity.h

(limited to 'include/linux')

diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h
new file mode 100644
index 000000000000..8a038ea0717e
--- /dev/null
+++ b/include/linux/blk-integrity.h
@@ -0,0 +1,183 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_BLK_INTEGRITY_H
+#define _LINUX_BLK_INTEGRITY_H
+
+#include <linux/blk-mq.h>
+
+struct request;
+
+enum blk_integrity_flags {
+	BLK_INTEGRITY_VERIFY		= 1 << 0,
+	BLK_INTEGRITY_GENERATE		= 1 << 1,
+	BLK_INTEGRITY_DEVICE_CAPABLE	= 1 << 2,
+	BLK_INTEGRITY_IP_CHECKSUM	= 1 << 3,
+};
+
+struct blk_integrity_iter {
+	void			*prot_buf;
+	void			*data_buf;
+	sector_t		seed;
+	unsigned int		data_size;
+	unsigned short		interval;
+	const char		*disk_name;
+};
+
+typedef blk_status_t (integrity_processing_fn) (struct blk_integrity_iter *);
+typedef void (integrity_prepare_fn) (struct request *);
+typedef void (integrity_complete_fn) (struct request *, unsigned int);
+
+struct blk_integrity_profile {
+	integrity_processing_fn		*generate_fn;
+	integrity_processing_fn		*verify_fn;
+	integrity_prepare_fn		*prepare_fn;
+	integrity_complete_fn		*complete_fn;
+	const char			*name;
+};
+
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+void blk_integrity_register(struct gendisk *, struct blk_integrity *);
+void blk_integrity_unregister(struct gendisk *);
+int blk_integrity_compare(struct gendisk *, struct gendisk *);
+int blk_rq_map_integrity_sg(struct request_queue *, struct bio *,
+				   struct scatterlist *);
+int blk_rq_count_integrity_sg(struct request_queue *, struct bio *);
+
+static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk)
+{
+	struct blk_integrity *bi = &disk->queue->integrity;
+
+	if (!bi->profile)
+		return NULL;
+
+	return bi;
+}
+
+static inline struct blk_integrity *
+bdev_get_integrity(struct block_device *bdev)
+{
+	return blk_get_integrity(bdev->bd_disk);
+}
+
+static inline bool
+blk_integrity_queue_supports_integrity(struct request_queue *q)
+{
+	return q->integrity.profile;
+}
+
+static inline void blk_queue_max_integrity_segments(struct request_queue *q,
+						    unsigned int segs)
+{
+	q->limits.max_integrity_segments = segs;
+}
+
+static inline unsigned short
+queue_max_integrity_segments(const struct request_queue *q)
+{
+	return q->limits.max_integrity_segments;
+}
+
+/**
+ * bio_integrity_intervals - Return number of integrity intervals for a bio
+ * @bi:		blk_integrity profile for device
+ * @sectors:	Size of the bio in 512-byte sectors
+ *
+ * Description: The block layer calculates everything in 512 byte
+ * sectors but integrity metadata is done in terms of the data integrity
+ * interval size of the storage device.  Convert the block layer sectors
+ * to the appropriate number of integrity intervals.
+ */
+static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
+						   unsigned int sectors)
+{
+	return sectors >> (bi->interval_exp - 9);
+}
+
+static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
+					       unsigned int sectors)
+{
+	return bio_integrity_intervals(bi, sectors) * bi->tuple_size;
+}
+
+static inline bool blk_integrity_rq(struct request *rq)
+{
+	return rq->cmd_flags & REQ_INTEGRITY;
+}
+
+/*
+ * Return the first bvec that contains integrity data.  Only drivers that are
+ * limited to a single integrity segment should use this helper.
+ */
+static inline struct bio_vec *rq_integrity_vec(struct request *rq)
+{
+	if (WARN_ON_ONCE(queue_max_integrity_segments(rq->q) > 1))
+		return NULL;
+	return rq->bio->bi_integrity->bip_vec;
+}
+#else /* CONFIG_BLK_DEV_INTEGRITY */
+static inline int blk_rq_count_integrity_sg(struct request_queue *q,
+					    struct bio *b)
+{
+	return 0;
+}
+static inline int blk_rq_map_integrity_sg(struct request_queue *q,
+					  struct bio *b,
+					  struct scatterlist *s)
+{
+	return 0;
+}
+static inline struct blk_integrity *bdev_get_integrity(struct block_device *b)
+{
+	return NULL;
+}
+static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk)
+{
+	return NULL;
+}
+static inline bool
+blk_integrity_queue_supports_integrity(struct request_queue *q)
+{
+	return false;
+}
+static inline int blk_integrity_compare(struct gendisk *a, struct gendisk *b)
+{
+	return 0;
+}
+static inline void blk_integrity_register(struct gendisk *d,
+					 struct blk_integrity *b)
+{
+}
+static inline void blk_integrity_unregister(struct gendisk *d)
+{
+}
+static inline void blk_queue_max_integrity_segments(struct request_queue *q,
+						    unsigned int segs)
+{
+}
+static inline unsigned short
+queue_max_integrity_segments(const struct request_queue *q)
+{
+	return 0;
+}
+
+static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
+						   unsigned int sectors)
+{
+	return 0;
+}
+
+static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
+					       unsigned int sectors)
+{
+	return 0;
+}
+static inline int blk_integrity_rq(struct request *rq)
+{
+	return 0;
+}
+
+static inline struct bio_vec *rq_integrity_vec(struct request *rq)
+{
+	return NULL;
+}
+#endif /* CONFIG_BLK_DEV_INTEGRITY */
+#endif /* _LINUX_BLK_INTEGRITY_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index be534040ca9c..56e60e5c09d0 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1555,189 +1555,6 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned lo
 #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \
 	MODULE_ALIAS("block-major-" __stringify(major) "-*")
 
-#if defined(CONFIG_BLK_DEV_INTEGRITY)
-
-enum blk_integrity_flags {
-	BLK_INTEGRITY_VERIFY		= 1 << 0,
-	BLK_INTEGRITY_GENERATE		= 1 << 1,
-	BLK_INTEGRITY_DEVICE_CAPABLE	= 1 << 2,
-	BLK_INTEGRITY_IP_CHECKSUM	= 1 << 3,
-};
-
-struct blk_integrity_iter {
-	void			*prot_buf;
-	void			*data_buf;
-	sector_t		seed;
-	unsigned int		data_size;
-	unsigned short		interval;
-	const char		*disk_name;
-};
-
-typedef blk_status_t (integrity_processing_fn) (struct blk_integrity_iter *);
-typedef void (integrity_prepare_fn) (struct request *);
-typedef void (integrity_complete_fn) (struct request *, unsigned int);
-
-struct blk_integrity_profile {
-	integrity_processing_fn		*generate_fn;
-	integrity_processing_fn		*verify_fn;
-	integrity_prepare_fn		*prepare_fn;
-	integrity_complete_fn		*complete_fn;
-	const char			*name;
-};
-
-extern void blk_integrity_register(struct gendisk *, struct blk_integrity *);
-extern void blk_integrity_unregister(struct gendisk *);
-extern int blk_integrity_compare(struct gendisk *, struct gendisk *);
-extern int blk_rq_map_integrity_sg(struct request_queue *, struct bio *,
-				   struct scatterlist *);
-extern int blk_rq_count_integrity_sg(struct request_queue *, struct bio *);
-
-static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk)
-{
-	struct blk_integrity *bi = &disk->queue->integrity;
-
-	if (!bi->profile)
-		return NULL;
-
-	return bi;
-}
-
-static inline
-struct blk_integrity *bdev_get_integrity(struct block_device *bdev)
-{
-	return blk_get_integrity(bdev->bd_disk);
-}
-
-static inline bool
-blk_integrity_queue_supports_integrity(struct request_queue *q)
-{
-	return q->integrity.profile;
-}
-
-static inline bool blk_integrity_rq(struct request *rq)
-{
-	return rq->cmd_flags & REQ_INTEGRITY;
-}
-
-static inline void blk_queue_max_integrity_segments(struct request_queue *q,
-						    unsigned int segs)
-{
-	q->limits.max_integrity_segments = segs;
-}
-
-static inline unsigned short
-queue_max_integrity_segments(const struct request_queue *q)
-{
-	return q->limits.max_integrity_segments;
-}
-
-/**
- * bio_integrity_intervals - Return number of integrity intervals for a bio
- * @bi:		blk_integrity profile for device
- * @sectors:	Size of the bio in 512-byte sectors
- *
- * Description: The block layer calculates everything in 512 byte
- * sectors but integrity metadata is done in terms of the data integrity
- * interval size of the storage device.  Convert the block layer sectors
- * to the appropriate number of integrity intervals.
- */
-static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
-						   unsigned int sectors)
-{
-	return sectors >> (bi->interval_exp - 9);
-}
-
-static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
-					       unsigned int sectors)
-{
-	return bio_integrity_intervals(bi, sectors) * bi->tuple_size;
-}
-
-/*
- * Return the first bvec that contains integrity data.  Only drivers that are
- * limited to a single integrity segment should use this helper.
- */
-static inline struct bio_vec *rq_integrity_vec(struct request *rq)
-{
-	if (WARN_ON_ONCE(queue_max_integrity_segments(rq->q) > 1))
-		return NULL;
-	return rq->bio->bi_integrity->bip_vec;
-}
-
-#else /* CONFIG_BLK_DEV_INTEGRITY */
-
-struct bio;
-struct block_device;
-struct gendisk;
-struct blk_integrity;
-
-static inline int blk_integrity_rq(struct request *rq)
-{
-	return 0;
-}
-static inline int blk_rq_count_integrity_sg(struct request_queue *q,
-					    struct bio *b)
-{
-	return 0;
-}
-static inline int blk_rq_map_integrity_sg(struct request_queue *q,
-					  struct bio *b,
-					  struct scatterlist *s)
-{
-	return 0;
-}
-static inline struct blk_integrity *bdev_get_integrity(struct block_device *b)
-{
-	return NULL;
-}
-static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk)
-{
-	return NULL;
-}
-static inline bool
-blk_integrity_queue_supports_integrity(struct request_queue *q)
-{
-	return false;
-}
-static inline int blk_integrity_compare(struct gendisk *a, struct gendisk *b)
-{
-	return 0;
-}
-static inline void blk_integrity_register(struct gendisk *d,
-					 struct blk_integrity *b)
-{
-}
-static inline void blk_integrity_unregister(struct gendisk *d)
-{
-}
-static inline void blk_queue_max_integrity_segments(struct request_queue *q,
-						    unsigned int segs)
-{
-}
-static inline unsigned short queue_max_integrity_segments(const struct request_queue *q)
-{
-	return 0;
-}
-
-static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
-						   unsigned int sectors)
-{
-	return 0;
-}
-
-static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
-					       unsigned int sectors)
-{
-	return 0;
-}
-
-static inline struct bio_vec *rq_integrity_vec(struct request *rq)
-{
-	return NULL;
-}
-
-#endif /* CONFIG_BLK_DEV_INTEGRITY */
-
 #ifdef CONFIG_BLK_INLINE_ENCRYPTION
 
 bool blk_ksm_register(struct blk_keyslot_manager *ksm, struct request_queue *q);
-- 
cgit v1.2.3


From 24b83deb29b7f06a5573b65f2ce96f5482755d43 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 20 Sep 2021 14:33:28 +0200
Subject: block: move struct request to blk-mq.h

struct request is only used by blk-mq drivers, so move it and all
related declarations to blk-mq.h.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20210920123328.1399408-18-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h       | 465 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/blk_types.h    |   2 -
 include/linux/blkdev.h       | 469 +------------------------------------------
 include/linux/blktrace_api.h |   2 +-
 include/linux/t10-pi.h       |   2 +-
 5 files changed, 468 insertions(+), 472 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 13ba1861e688..bd4086a6f28e 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -6,10 +6,218 @@
 #include <linux/sbitmap.h>
 #include <linux/srcu.h>
 #include <linux/lockdep.h>
+#include <linux/scatterlist.h>
 
 struct blk_mq_tags;
 struct blk_flush_queue;
 
+#define BLKDEV_MIN_RQ	4
+#define BLKDEV_MAX_RQ	128	/* Default maximum */
+
+typedef void (rq_end_io_fn)(struct request *, blk_status_t);
+
+/*
+ * request flags */
+typedef __u32 __bitwise req_flags_t;
+
+/* drive already may have started this one */
+#define RQF_STARTED		((__force req_flags_t)(1 << 1))
+/* may not be passed by ioscheduler */
+#define RQF_SOFTBARRIER		((__force req_flags_t)(1 << 3))
+/* request for flush sequence */
+#define RQF_FLUSH_SEQ		((__force req_flags_t)(1 << 4))
+/* merge of different types, fail separately */
+#define RQF_MIXED_MERGE		((__force req_flags_t)(1 << 5))
+/* track inflight for MQ */
+#define RQF_MQ_INFLIGHT		((__force req_flags_t)(1 << 6))
+/* don't call prep for this one */
+#define RQF_DONTPREP		((__force req_flags_t)(1 << 7))
+/* vaguely specified driver internal error.  Ignored by the block layer */
+#define RQF_FAILED		((__force req_flags_t)(1 << 10))
+/* don't warn about errors */
+#define RQF_QUIET		((__force req_flags_t)(1 << 11))
+/* elevator private data attached */
+#define RQF_ELVPRIV		((__force req_flags_t)(1 << 12))
+/* account into disk and partition IO statistics */
+#define RQF_IO_STAT		((__force req_flags_t)(1 << 13))
+/* runtime pm request */
+#define RQF_PM			((__force req_flags_t)(1 << 15))
+/* on IO scheduler merge hash */
+#define RQF_HASHED		((__force req_flags_t)(1 << 16))
+/* track IO completion time */
+#define RQF_STATS		((__force req_flags_t)(1 << 17))
+/* Look at ->special_vec for the actual data payload instead of the
+   bio chain. */
+#define RQF_SPECIAL_PAYLOAD	((__force req_flags_t)(1 << 18))
+/* The per-zone write lock is held for this request */
+#define RQF_ZONE_WRITE_LOCKED	((__force req_flags_t)(1 << 19))
+/* already slept for hybrid poll */
+#define RQF_MQ_POLL_SLEPT	((__force req_flags_t)(1 << 20))
+/* ->timeout has been called, don't expire again */
+#define RQF_TIMED_OUT		((__force req_flags_t)(1 << 21))
+
+/* flags that prevent us from merging requests: */
+#define RQF_NOMERGE_FLAGS \
+	(RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)
+
+enum mq_rq_state {
+	MQ_RQ_IDLE		= 0,
+	MQ_RQ_IN_FLIGHT		= 1,
+	MQ_RQ_COMPLETE		= 2,
+};
+
+/*
+ * Try to put the fields that are referenced together in the same cacheline.
+ *
+ * If you modify this structure, make sure to update blk_rq_init() and
+ * especially blk_mq_rq_ctx_init() to take care of the added fields.
+ */
+struct request {
+	struct request_queue *q;
+	struct blk_mq_ctx *mq_ctx;
+	struct blk_mq_hw_ctx *mq_hctx;
+
+	unsigned int cmd_flags;		/* op and common flags */
+	req_flags_t rq_flags;
+
+	int tag;
+	int internal_tag;
+
+	/* the following two fields are internal, NEVER access directly */
+	unsigned int __data_len;	/* total data len */
+	sector_t __sector;		/* sector cursor */
+
+	struct bio *bio;
+	struct bio *biotail;
+
+	struct list_head queuelist;
+
+	/*
+	 * The hash is used inside the scheduler, and killed once the
+	 * request reaches the dispatch list. The ipi_list is only used
+	 * to queue the request for softirq completion, which is long
+	 * after the request has been unhashed (and even removed from
+	 * the dispatch list).
+	 */
+	union {
+		struct hlist_node hash;	/* merge hash */
+		struct llist_node ipi_list;
+	};
+
+	/*
+	 * The rb_node is only used inside the io scheduler, requests
+	 * are pruned when moved to the dispatch queue. So let the
+	 * completion_data share space with the rb_node.
+	 */
+	union {
+		struct rb_node rb_node;	/* sort/lookup */
+		struct bio_vec special_vec;
+		void *completion_data;
+		int error_count; /* for legacy drivers, don't use */
+	};
+
+	/*
+	 * Three pointers are available for the IO schedulers, if they need
+	 * more they have to dynamically allocate it.  Flush requests are
+	 * never put on the IO scheduler. So let the flush fields share
+	 * space with the elevator data.
+	 */
+	union {
+		struct {
+			struct io_cq		*icq;
+			void			*priv[2];
+		} elv;
+
+		struct {
+			unsigned int		seq;
+			struct list_head	list;
+			rq_end_io_fn		*saved_end_io;
+		} flush;
+	};
+
+	struct gendisk *rq_disk;
+	struct block_device *part;
+#ifdef CONFIG_BLK_RQ_ALLOC_TIME
+	/* Time that the first bio started allocating this request. */
+	u64 alloc_time_ns;
+#endif
+	/* Time that this request was allocated for this IO. */
+	u64 start_time_ns;
+	/* Time that I/O was submitted to the device. */
+	u64 io_start_time_ns;
+
+#ifdef CONFIG_BLK_WBT
+	unsigned short wbt_flags;
+#endif
+	/*
+	 * rq sectors used for blk stats. It has the same value
+	 * with blk_rq_sectors(rq), except that it never be zeroed
+	 * by completion.
+	 */
+	unsigned short stats_sectors;
+
+	/*
+	 * Number of scatter-gather DMA addr+len pairs after
+	 * physical address coalescing is performed.
+	 */
+	unsigned short nr_phys_segments;
+
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+	unsigned short nr_integrity_segments;
+#endif
+
+#ifdef CONFIG_BLK_INLINE_ENCRYPTION
+	struct bio_crypt_ctx *crypt_ctx;
+	struct blk_ksm_keyslot *crypt_keyslot;
+#endif
+
+	unsigned short write_hint;
+	unsigned short ioprio;
+
+	enum mq_rq_state state;
+	refcount_t ref;
+
+	unsigned int timeout;
+	unsigned long deadline;
+
+	union {
+		struct __call_single_data csd;
+		u64 fifo_time;
+	};
+
+	/*
+	 * completion callback.
+	 */
+	rq_end_io_fn *end_io;
+	void *end_io_data;
+};
+
+#define req_op(req) \
+	((req)->cmd_flags & REQ_OP_MASK)
+
+static inline bool blk_rq_is_passthrough(struct request *rq)
+{
+	return blk_op_is_passthrough(req_op(rq));
+}
+
+static inline unsigned short req_get_ioprio(struct request *req)
+{
+	return req->ioprio;
+}
+
+#define rq_data_dir(rq)		(op_is_write(req_op(rq)) ? WRITE : READ)
+
+#define rq_dma_dir(rq) \
+	(op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE)
+
+enum blk_eh_timer_return {
+	BLK_EH_DONE,		/* drivers has completed the command */
+	BLK_EH_RESET_TIMER,	/* reset timer and try again */
+};
+
+#define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */
+#define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */
+
 /**
  * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware
  * block device
@@ -637,4 +845,261 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio);
 void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
 		struct lock_class_key *key);
 
+static inline bool rq_is_sync(struct request *rq)
+{
+	return op_is_sync(rq->cmd_flags);
+}
+
+void blk_rq_init(struct request_queue *q, struct request *rq);
+void blk_put_request(struct request *rq);
+struct request *blk_get_request(struct request_queue *q, unsigned int op,
+		blk_mq_req_flags_t flags);
+int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
+		struct bio_set *bs, gfp_t gfp_mask,
+		int (*bio_ctr)(struct bio *, struct bio *, void *), void *data);
+void blk_rq_unprep_clone(struct request *rq);
+blk_status_t blk_insert_cloned_request(struct request_queue *q,
+		struct request *rq);
+
+struct rq_map_data {
+	struct page **pages;
+	int page_order;
+	int nr_entries;
+	unsigned long offset;
+	int null_mapped;
+	int from_user;
+};
+
+int blk_rq_map_user(struct request_queue *, struct request *,
+		struct rq_map_data *, void __user *, unsigned long, gfp_t);
+int blk_rq_map_user_iov(struct request_queue *, struct request *,
+		struct rq_map_data *, const struct iov_iter *, gfp_t);
+int blk_rq_unmap_user(struct bio *);
+int blk_rq_map_kern(struct request_queue *, struct request *, void *,
+		unsigned int, gfp_t);
+int blk_rq_append_bio(struct request *rq, struct bio *bio);
+void blk_execute_rq_nowait(struct gendisk *, struct request *, int,
+		rq_end_io_fn *);
+blk_status_t blk_execute_rq(struct gendisk *bd_disk, struct request *rq,
+		int at_head);
+
+struct req_iterator {
+	struct bvec_iter iter;
+	struct bio *bio;
+};
+
+#define __rq_for_each_bio(_bio, rq)	\
+	if ((rq->bio))			\
+		for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next)
+
+#define rq_for_each_segment(bvl, _rq, _iter)			\
+	__rq_for_each_bio(_iter.bio, _rq)			\
+		bio_for_each_segment(bvl, _iter.bio, _iter.iter)
+
+#define rq_for_each_bvec(bvl, _rq, _iter)			\
+	__rq_for_each_bio(_iter.bio, _rq)			\
+		bio_for_each_bvec(bvl, _iter.bio, _iter.iter)
+
+#define rq_iter_last(bvec, _iter)				\
+		(_iter.bio->bi_next == NULL &&			\
+		 bio_iter_last(bvec, _iter.iter))
+
+/*
+ * blk_rq_pos()			: the current sector
+ * blk_rq_bytes()		: bytes left in the entire request
+ * blk_rq_cur_bytes()		: bytes left in the current segment
+ * blk_rq_err_bytes()		: bytes left till the next error boundary
+ * blk_rq_sectors()		: sectors left in the entire request
+ * blk_rq_cur_sectors()		: sectors left in the current segment
+ * blk_rq_stats_sectors()	: sectors of the entire request used for stats
+ */
+static inline sector_t blk_rq_pos(const struct request *rq)
+{
+	return rq->__sector;
+}
+
+static inline unsigned int blk_rq_bytes(const struct request *rq)
+{
+	return rq->__data_len;
+}
+
+static inline int blk_rq_cur_bytes(const struct request *rq)
+{
+	return rq->bio ? bio_cur_bytes(rq->bio) : 0;
+}
+
+unsigned int blk_rq_err_bytes(const struct request *rq);
+
+static inline unsigned int blk_rq_sectors(const struct request *rq)
+{
+	return blk_rq_bytes(rq) >> SECTOR_SHIFT;
+}
+
+static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
+{
+	return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT;
+}
+
+static inline unsigned int blk_rq_stats_sectors(const struct request *rq)
+{
+	return rq->stats_sectors;
+}
+
+/*
+ * Some commands like WRITE SAME have a payload or data transfer size which
+ * is different from the size of the request.  Any driver that supports such
+ * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to
+ * calculate the data transfer size.
+ */
+static inline unsigned int blk_rq_payload_bytes(struct request *rq)
+{
+	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
+		return rq->special_vec.bv_len;
+	return blk_rq_bytes(rq);
+}
+
+/*
+ * Return the first full biovec in the request.  The caller needs to check that
+ * there are any bvecs before calling this helper.
+ */
+static inline struct bio_vec req_bvec(struct request *rq)
+{
+	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
+		return rq->special_vec;
+	return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter);
+}
+
+static inline unsigned int blk_rq_count_bios(struct request *rq)
+{
+	unsigned int nr_bios = 0;
+	struct bio *bio;
+
+	__rq_for_each_bio(bio, rq)
+		nr_bios++;
+
+	return nr_bios;
+}
+
+void blk_steal_bios(struct bio_list *list, struct request *rq);
+
+/*
+ * Request completion related functions.
+ *
+ * blk_update_request() completes given number of bytes and updates
+ * the request without completing it.
+ */
+bool blk_update_request(struct request *rq, blk_status_t error,
+			       unsigned int nr_bytes);
+void blk_abort_request(struct request *);
+
+/*
+ * Number of physical segments as sent to the device.
+ *
+ * Normally this is the number of discontiguous data segments sent by the
+ * submitter.  But for data-less command like discard we might have no
+ * actual data segments submitted, but the driver might have to add it's
+ * own special payload.  In that case we still return 1 here so that this
+ * special payload will be mapped.
+ */
+static inline unsigned short blk_rq_nr_phys_segments(struct request *rq)
+{
+	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
+		return 1;
+	return rq->nr_phys_segments;
+}
+
+/*
+ * Number of discard segments (or ranges) the driver needs to fill in.
+ * Each discard bio merged into a request is counted as one segment.
+ */
+static inline unsigned short blk_rq_nr_discard_segments(struct request *rq)
+{
+	return max_t(unsigned short, rq->nr_phys_segments, 1);
+}
+
+int __blk_rq_map_sg(struct request_queue *q, struct request *rq,
+		struct scatterlist *sglist, struct scatterlist **last_sg);
+static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq,
+		struct scatterlist *sglist)
+{
+	struct scatterlist *last_sg = NULL;
+
+	return __blk_rq_map_sg(q, rq, sglist, &last_sg);
+}
+void blk_dump_rq_flags(struct request *, char *);
+
+#ifdef CONFIG_BLK_DEV_ZONED
+static inline unsigned int blk_rq_zone_no(struct request *rq)
+{
+	return blk_queue_zone_no(rq->q, blk_rq_pos(rq));
+}
+
+static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
+{
+	return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq));
+}
+
+bool blk_req_needs_zone_write_lock(struct request *rq);
+bool blk_req_zone_write_trylock(struct request *rq);
+void __blk_req_zone_write_lock(struct request *rq);
+void __blk_req_zone_write_unlock(struct request *rq);
+
+static inline void blk_req_zone_write_lock(struct request *rq)
+{
+	if (blk_req_needs_zone_write_lock(rq))
+		__blk_req_zone_write_lock(rq);
+}
+
+static inline void blk_req_zone_write_unlock(struct request *rq)
+{
+	if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED)
+		__blk_req_zone_write_unlock(rq);
+}
+
+static inline bool blk_req_zone_is_write_locked(struct request *rq)
+{
+	return rq->q->seq_zones_wlock &&
+		test_bit(blk_rq_zone_no(rq), rq->q->seq_zones_wlock);
+}
+
+static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
+{
+	if (!blk_req_needs_zone_write_lock(rq))
+		return true;
+	return !blk_req_zone_is_write_locked(rq);
+}
+#else /* CONFIG_BLK_DEV_ZONED */
+static inline bool blk_req_needs_zone_write_lock(struct request *rq)
+{
+	return false;
+}
+
+static inline void blk_req_zone_write_lock(struct request *rq)
+{
+}
+
+static inline void blk_req_zone_write_unlock(struct request *rq)
+{
+}
+static inline bool blk_req_zone_is_write_locked(struct request *rq)
+{
+	return false;
+}
+
+static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
+{
+	return true;
+}
+#endif /* CONFIG_BLK_DEV_ZONED */
+
+#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
+# error	"You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform"
 #endif
+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
+void rq_flush_dcache_pages(struct request *rq);
+#else
+static inline void rq_flush_dcache_pages(struct request *rq)
+{
+}
+#endif /* ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE */
+#endif /* BLK_MQ_H */
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index be622b5a21ed..3b967053e9f5 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -431,8 +431,6 @@ enum stat_group {
 
 #define bio_op(bio) \
 	((bio)->bi_opf & REQ_OP_MASK)
-#define req_op(req) \
-	((req)->cmd_flags & REQ_OP_MASK)
 
 /* obsolete, don't use in new code */
 static inline void bio_set_op_attrs(struct bio *bio, unsigned op,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 56e60e5c09d0..0e960d74615e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -14,7 +14,6 @@
 #include <linux/gfp.h>
 #include <linux/rcupdate.h>
 #include <linux/percpu-refcount.h>
-#include <linux/scatterlist.h>
 #include <linux/blkzoned.h>
 #include <linux/sbitmap.h>
 
@@ -32,9 +31,6 @@ struct blk_queue_stats;
 struct blk_stat_callback;
 struct blk_keyslot_manager;
 
-#define BLKDEV_MIN_RQ	4
-#define BLKDEV_MAX_RQ	128	/* Default maximum */
-
 /* Must be consistent with blk_mq_poll_stats_bkt() */
 #define BLK_MQ_POLL_STATS_BKTS 16
 
@@ -47,213 +43,12 @@ struct blk_keyslot_manager;
  */
 #define BLKCG_MAX_POLS		6
 
-typedef void (rq_end_io_fn)(struct request *, blk_status_t);
-
-/*
- * request flags */
-typedef __u32 __bitwise req_flags_t;
-
-/* drive already may have started this one */
-#define RQF_STARTED		((__force req_flags_t)(1 << 1))
-/* may not be passed by ioscheduler */
-#define RQF_SOFTBARRIER		((__force req_flags_t)(1 << 3))
-/* request for flush sequence */
-#define RQF_FLUSH_SEQ		((__force req_flags_t)(1 << 4))
-/* merge of different types, fail separately */
-#define RQF_MIXED_MERGE		((__force req_flags_t)(1 << 5))
-/* track inflight for MQ */
-#define RQF_MQ_INFLIGHT		((__force req_flags_t)(1 << 6))
-/* don't call prep for this one */
-#define RQF_DONTPREP		((__force req_flags_t)(1 << 7))
-/* vaguely specified driver internal error.  Ignored by the block layer */
-#define RQF_FAILED		((__force req_flags_t)(1 << 10))
-/* don't warn about errors */
-#define RQF_QUIET		((__force req_flags_t)(1 << 11))
-/* elevator private data attached */
-#define RQF_ELVPRIV		((__force req_flags_t)(1 << 12))
-/* account into disk and partition IO statistics */
-#define RQF_IO_STAT		((__force req_flags_t)(1 << 13))
-/* runtime pm request */
-#define RQF_PM			((__force req_flags_t)(1 << 15))
-/* on IO scheduler merge hash */
-#define RQF_HASHED		((__force req_flags_t)(1 << 16))
-/* track IO completion time */
-#define RQF_STATS		((__force req_flags_t)(1 << 17))
-/* Look at ->special_vec for the actual data payload instead of the
-   bio chain. */
-#define RQF_SPECIAL_PAYLOAD	((__force req_flags_t)(1 << 18))
-/* The per-zone write lock is held for this request */
-#define RQF_ZONE_WRITE_LOCKED	((__force req_flags_t)(1 << 19))
-/* already slept for hybrid poll */
-#define RQF_MQ_POLL_SLEPT	((__force req_flags_t)(1 << 20))
-/* ->timeout has been called, don't expire again */
-#define RQF_TIMED_OUT		((__force req_flags_t)(1 << 21))
-
-/* flags that prevent us from merging requests: */
-#define RQF_NOMERGE_FLAGS \
-	(RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)
-
-/*
- * Request state for blk-mq.
- */
-enum mq_rq_state {
-	MQ_RQ_IDLE		= 0,
-	MQ_RQ_IN_FLIGHT		= 1,
-	MQ_RQ_COMPLETE		= 2,
-};
-
-/*
- * Try to put the fields that are referenced together in the same cacheline.
- *
- * If you modify this structure, make sure to update blk_rq_init() and
- * especially blk_mq_rq_ctx_init() to take care of the added fields.
- */
-struct request {
-	struct request_queue *q;
-	struct blk_mq_ctx *mq_ctx;
-	struct blk_mq_hw_ctx *mq_hctx;
-
-	unsigned int cmd_flags;		/* op and common flags */
-	req_flags_t rq_flags;
-
-	int tag;
-	int internal_tag;
-
-	/* the following two fields are internal, NEVER access directly */
-	unsigned int __data_len;	/* total data len */
-	sector_t __sector;		/* sector cursor */
-
-	struct bio *bio;
-	struct bio *biotail;
-
-	struct list_head queuelist;
-
-	/*
-	 * The hash is used inside the scheduler, and killed once the
-	 * request reaches the dispatch list. The ipi_list is only used
-	 * to queue the request for softirq completion, which is long
-	 * after the request has been unhashed (and even removed from
-	 * the dispatch list).
-	 */
-	union {
-		struct hlist_node hash;	/* merge hash */
-		struct llist_node ipi_list;
-	};
-
-	/*
-	 * The rb_node is only used inside the io scheduler, requests
-	 * are pruned when moved to the dispatch queue. So let the
-	 * completion_data share space with the rb_node.
-	 */
-	union {
-		struct rb_node rb_node;	/* sort/lookup */
-		struct bio_vec special_vec;
-		void *completion_data;
-		int error_count; /* for legacy drivers, don't use */
-	};
-
-	/*
-	 * Three pointers are available for the IO schedulers, if they need
-	 * more they have to dynamically allocate it.  Flush requests are
-	 * never put on the IO scheduler. So let the flush fields share
-	 * space with the elevator data.
-	 */
-	union {
-		struct {
-			struct io_cq		*icq;
-			void			*priv[2];
-		} elv;
-
-		struct {
-			unsigned int		seq;
-			struct list_head	list;
-			rq_end_io_fn		*saved_end_io;
-		} flush;
-	};
-
-	struct gendisk *rq_disk;
-	struct block_device *part;
-#ifdef CONFIG_BLK_RQ_ALLOC_TIME
-	/* Time that the first bio started allocating this request. */
-	u64 alloc_time_ns;
-#endif
-	/* Time that this request was allocated for this IO. */
-	u64 start_time_ns;
-	/* Time that I/O was submitted to the device. */
-	u64 io_start_time_ns;
-
-#ifdef CONFIG_BLK_WBT
-	unsigned short wbt_flags;
-#endif
-	/*
-	 * rq sectors used for blk stats. It has the same value
-	 * with blk_rq_sectors(rq), except that it never be zeroed
-	 * by completion.
-	 */
-	unsigned short stats_sectors;
-
-	/*
-	 * Number of scatter-gather DMA addr+len pairs after
-	 * physical address coalescing is performed.
-	 */
-	unsigned short nr_phys_segments;
-
-#if defined(CONFIG_BLK_DEV_INTEGRITY)
-	unsigned short nr_integrity_segments;
-#endif
-
-#ifdef CONFIG_BLK_INLINE_ENCRYPTION
-	struct bio_crypt_ctx *crypt_ctx;
-	struct blk_ksm_keyslot *crypt_keyslot;
-#endif
-
-	unsigned short write_hint;
-	unsigned short ioprio;
-
-	enum mq_rq_state state;
-	refcount_t ref;
-
-	unsigned int timeout;
-	unsigned long deadline;
-
-	union {
-		struct __call_single_data csd;
-		u64 fifo_time;
-	};
-
-	/*
-	 * completion callback.
-	 */
-	rq_end_io_fn *end_io;
-	void *end_io_data;
-};
-
 static inline bool blk_op_is_passthrough(unsigned int op)
 {
 	op &= REQ_OP_MASK;
 	return op == REQ_OP_DRV_IN || op == REQ_OP_DRV_OUT;
 }
 
-static inline bool blk_rq_is_passthrough(struct request *rq)
-{
-	return blk_op_is_passthrough(req_op(rq));
-}
-
-static inline unsigned short req_get_ioprio(struct request *req)
-{
-	return req->ioprio;
-}
-
-struct bio_vec;
-
-enum blk_eh_timer_return {
-	BLK_EH_DONE,		/* drivers has completed the command */
-	BLK_EH_RESET_TIMER,	/* reset timer and try again */
-};
-
-#define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */
-#define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */
-
 /*
  * Zoned block device models (zoned limit).
  *
@@ -620,11 +415,6 @@ extern void blk_clear_pm_only(struct request_queue *q);
 
 #define list_entry_rq(ptr)	list_entry((ptr), struct request, queuelist)
 
-#define rq_data_dir(rq)		(op_is_write(req_op(rq)) ? WRITE : READ)
-
-#define rq_dma_dir(rq) \
-	(op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE)
-
 #define dma_map_bvec(dev, bv, dir, attrs) \
 	dma_map_page_attrs(dev, (bv)->bv_page, (bv)->bv_offset, (bv)->bv_len, \
 	(dir), (attrs))
@@ -740,11 +530,6 @@ static inline unsigned int queue_max_active_zones(const struct request_queue *q)
 }
 #endif /* CONFIG_BLK_DEV_ZONED */
 
-static inline bool rq_is_sync(struct request *rq)
-{
-	return op_is_sync(rq->cmd_flags);
-}
-
 static inline unsigned int blk_queue_depth(struct request_queue *q)
 {
 	if (q->queue_depth)
@@ -759,83 +544,20 @@ static inline unsigned int blk_queue_depth(struct request_queue *q)
 #define BLK_DEFAULT_SG_TIMEOUT	(60 * HZ)
 #define BLK_MIN_SG_TIMEOUT	(7 * HZ)
 
-struct rq_map_data {
-	struct page **pages;
-	int page_order;
-	int nr_entries;
-	unsigned long offset;
-	int null_mapped;
-	int from_user;
-};
-
-struct req_iterator {
-	struct bvec_iter iter;
-	struct bio *bio;
-};
-
 /* This should not be used directly - use rq_for_each_segment */
 #define for_each_bio(_bio)		\
 	for (; _bio; _bio = _bio->bi_next)
-#define __rq_for_each_bio(_bio, rq)	\
-	if ((rq->bio))			\
-		for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next)
-
-#define rq_for_each_segment(bvl, _rq, _iter)			\
-	__rq_for_each_bio(_iter.bio, _rq)			\
-		bio_for_each_segment(bvl, _iter.bio, _iter.iter)
 
-#define rq_for_each_bvec(bvl, _rq, _iter)			\
-	__rq_for_each_bio(_iter.bio, _rq)			\
-		bio_for_each_bvec(bvl, _iter.bio, _iter.iter)
-
-#define rq_iter_last(bvec, _iter)				\
-		(_iter.bio->bi_next == NULL &&			\
-		 bio_iter_last(bvec, _iter.iter))
-
-#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
-# error	"You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform"
-#endif
-#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
-extern void rq_flush_dcache_pages(struct request *rq);
-#else
-static inline void rq_flush_dcache_pages(struct request *rq)
-{
-}
-#endif
 
 extern int blk_register_queue(struct gendisk *disk);
 extern void blk_unregister_queue(struct gendisk *disk);
 blk_qc_t submit_bio_noacct(struct bio *bio);
-extern void blk_rq_init(struct request_queue *q, struct request *rq);
-extern void blk_put_request(struct request *);
-extern struct request *blk_get_request(struct request_queue *, unsigned int op,
-				       blk_mq_req_flags_t flags);
+
 extern int blk_lld_busy(struct request_queue *q);
-extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
-			     struct bio_set *bs, gfp_t gfp_mask,
-			     int (*bio_ctr)(struct bio *, struct bio *, void *),
-			     void *data);
-extern void blk_rq_unprep_clone(struct request *rq);
-extern blk_status_t blk_insert_cloned_request(struct request_queue *q,
-				     struct request *rq);
-int blk_rq_append_bio(struct request *rq, struct bio *bio);
 extern void blk_queue_split(struct bio **);
 extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags);
 extern void blk_queue_exit(struct request_queue *q);
 extern void blk_sync_queue(struct request_queue *q);
-extern int blk_rq_map_user(struct request_queue *, struct request *,
-			   struct rq_map_data *, void __user *, unsigned long,
-			   gfp_t);
-extern int blk_rq_unmap_user(struct bio *);
-extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, unsigned int, gfp_t);
-extern int blk_rq_map_user_iov(struct request_queue *, struct request *,
-			       struct rq_map_data *, const struct iov_iter *,
-			       gfp_t);
-extern void blk_execute_rq_nowait(struct gendisk *,
-				  struct request *, int, rq_end_io_fn *);
-
-blk_status_t blk_execute_rq(struct gendisk *bd_disk, struct request *rq,
-			    int at_head);
 
 /* Helper to convert REQ_OP_XXX to its string format XXX */
 extern const char *blk_op_str(unsigned int op);
@@ -867,47 +589,6 @@ static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
 #define PAGE_SECTORS		(1 << PAGE_SECTORS_SHIFT)
 #define SECTOR_MASK		(PAGE_SECTORS - 1)
 
-/*
- * blk_rq_pos()			: the current sector
- * blk_rq_bytes()		: bytes left in the entire request
- * blk_rq_cur_bytes()		: bytes left in the current segment
- * blk_rq_err_bytes()		: bytes left till the next error boundary
- * blk_rq_sectors()		: sectors left in the entire request
- * blk_rq_cur_sectors()		: sectors left in the current segment
- * blk_rq_stats_sectors()	: sectors of the entire request used for stats
- */
-static inline sector_t blk_rq_pos(const struct request *rq)
-{
-	return rq->__sector;
-}
-
-static inline unsigned int blk_rq_bytes(const struct request *rq)
-{
-	return rq->__data_len;
-}
-
-static inline int blk_rq_cur_bytes(const struct request *rq)
-{
-	return rq->bio ? bio_cur_bytes(rq->bio) : 0;
-}
-
-extern unsigned int blk_rq_err_bytes(const struct request *rq);
-
-static inline unsigned int blk_rq_sectors(const struct request *rq)
-{
-	return blk_rq_bytes(rq) >> SECTOR_SHIFT;
-}
-
-static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
-{
-	return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT;
-}
-
-static inline unsigned int blk_rq_stats_sectors(const struct request *rq)
-{
-	return rq->stats_sectors;
-}
-
 #ifdef CONFIG_BLK_DEV_ZONED
 
 /* Helper to convert BLK_ZONE_ZONE_XXX to its string format XXX */
@@ -924,42 +605,8 @@ static inline unsigned int bio_zone_is_seq(struct bio *bio)
 	return blk_queue_zone_is_seq(bdev_get_queue(bio->bi_bdev),
 				     bio->bi_iter.bi_sector);
 }
-
-static inline unsigned int blk_rq_zone_no(struct request *rq)
-{
-	return blk_queue_zone_no(rq->q, blk_rq_pos(rq));
-}
-
-static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
-{
-	return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq));
-}
 #endif /* CONFIG_BLK_DEV_ZONED */
 
-/*
- * Some commands like WRITE SAME have a payload or data transfer size which
- * is different from the size of the request.  Any driver that supports such
- * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to
- * calculate the data transfer size.
- */
-static inline unsigned int blk_rq_payload_bytes(struct request *rq)
-{
-	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
-		return rq->special_vec.bv_len;
-	return blk_rq_bytes(rq);
-}
-
-/*
- * Return the first full biovec in the request.  The caller needs to check that
- * there are any bvecs before calling this helper.
- */
-static inline struct bio_vec req_bvec(struct request *rq)
-{
-	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
-		return rq->special_vec;
-	return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter);
-}
-
 static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
 						     int op)
 {
@@ -999,30 +646,6 @@ static inline unsigned int blk_max_size_offset(struct request_queue *q,
 	return min(q->limits.max_sectors, chunk_sectors);
 }
 
-static inline unsigned int blk_rq_count_bios(struct request *rq)
-{
-	unsigned int nr_bios = 0;
-	struct bio *bio;
-
-	__rq_for_each_bio(bio, rq)
-		nr_bios++;
-
-	return nr_bios;
-}
-
-void blk_steal_bios(struct bio_list *list, struct request *rq);
-
-/*
- * Request completion related functions.
- *
- * blk_update_request() completes given number of bytes and updates
- * the request without completing it.
- */
-extern bool blk_update_request(struct request *rq, blk_status_t error,
-			       unsigned int nr_bytes);
-
-extern void blk_abort_request(struct request *);
-
 /*
  * Access functions for manipulating queue properties
  */
@@ -1081,42 +704,6 @@ extern void blk_queue_required_elevator_features(struct request_queue *q,
 extern bool blk_queue_can_use_dma_map_merging(struct request_queue *q,
 					      struct device *dev);
 
-/*
- * Number of physical segments as sent to the device.
- *
- * Normally this is the number of discontiguous data segments sent by the
- * submitter.  But for data-less command like discard we might have no
- * actual data segments submitted, but the driver might have to add it's
- * own special payload.  In that case we still return 1 here so that this
- * special payload will be mapped.
- */
-static inline unsigned short blk_rq_nr_phys_segments(struct request *rq)
-{
-	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
-		return 1;
-	return rq->nr_phys_segments;
-}
-
-/*
- * Number of discard segments (or ranges) the driver needs to fill in.
- * Each discard bio merged into a request is counted as one segment.
- */
-static inline unsigned short blk_rq_nr_discard_segments(struct request *rq)
-{
-	return max_t(unsigned short, rq->nr_phys_segments, 1);
-}
-
-int __blk_rq_map_sg(struct request_queue *q, struct request *rq,
-		struct scatterlist *sglist, struct scatterlist **last_sg);
-static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq,
-		struct scatterlist *sglist)
-{
-	struct scatterlist *last_sg = NULL;
-
-	return __blk_rq_map_sg(q, rq, sglist, &last_sg);
-}
-extern void blk_dump_rq_flags(struct request *, char *);
-
 bool __must_check blk_get_queue(struct request_queue *);
 extern void blk_put_queue(struct request_queue *);
 extern void blk_set_queue_dying(struct request_queue *);
@@ -1613,60 +1200,6 @@ extern int bdev_read_page(struct block_device *, sector_t, struct page *);
 extern int bdev_write_page(struct block_device *, sector_t, struct page *,
 						struct writeback_control *);
 
-#ifdef CONFIG_BLK_DEV_ZONED
-bool blk_req_needs_zone_write_lock(struct request *rq);
-bool blk_req_zone_write_trylock(struct request *rq);
-void __blk_req_zone_write_lock(struct request *rq);
-void __blk_req_zone_write_unlock(struct request *rq);
-
-static inline void blk_req_zone_write_lock(struct request *rq)
-{
-	if (blk_req_needs_zone_write_lock(rq))
-		__blk_req_zone_write_lock(rq);
-}
-
-static inline void blk_req_zone_write_unlock(struct request *rq)
-{
-	if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED)
-		__blk_req_zone_write_unlock(rq);
-}
-
-static inline bool blk_req_zone_is_write_locked(struct request *rq)
-{
-	return rq->q->seq_zones_wlock &&
-		test_bit(blk_rq_zone_no(rq), rq->q->seq_zones_wlock);
-}
-
-static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
-{
-	if (!blk_req_needs_zone_write_lock(rq))
-		return true;
-	return !blk_req_zone_is_write_locked(rq);
-}
-#else
-static inline bool blk_req_needs_zone_write_lock(struct request *rq)
-{
-	return false;
-}
-
-static inline void blk_req_zone_write_lock(struct request *rq)
-{
-}
-
-static inline void blk_req_zone_write_unlock(struct request *rq)
-{
-}
-static inline bool blk_req_zone_is_write_locked(struct request *rq)
-{
-	return false;
-}
-
-static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
-{
-	return true;
-}
-#endif /* CONFIG_BLK_DEV_ZONED */
-
 static inline void blk_wake_io_task(struct task_struct *waiter)
 {
 	/*
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index a083e15df608..22501a293fa5 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -2,7 +2,7 @@
 #ifndef BLKTRACE_H
 #define BLKTRACE_H
 
-#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 #include <linux/relay.h>
 #include <linux/compat.h>
 #include <uapi/linux/blktrace_api.h>
diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h
index 96305a64a5a7..c635c2e014e3 100644
--- a/include/linux/t10-pi.h
+++ b/include/linux/t10-pi.h
@@ -3,7 +3,7 @@
 #define _LINUX_T10_PI_H
 
 #include <linux/types.h>
-#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 
 /*
  * A T10 PI-capable target device can be formatted with different
-- 
cgit v1.2.3


From d2a27964e60ff18e637334864042af54de383902 Mon Sep 17 00:00:00 2001
From: John Garry <john.garry@huawei.com>
Date: Tue, 5 Oct 2021 18:23:27 +0800
Subject: block: Rename BLKDEV_MAX_RQ -> BLKDEV_DEFAULT_RQ

It is a bit confusing that there is BLKDEV_MAX_RQ and MAX_SCHED_RQ, as
the name BLKDEV_MAX_RQ would imply the max requests always, which it is
not.

Rename to BLKDEV_MAX_RQ to BLKDEV_DEFAULT_RQ, matching its usage - that being
the default number of requests assigned when allocating a request queue.

Signed-off-by: John Garry <john.garry@huawei.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Link: https://lore.kernel.org/r/1633429419-228500-3-git-send-email-john.garry@huawei.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index bd4086a6f28e..31cc41dfef6b 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -12,7 +12,7 @@ struct blk_mq_tags;
 struct blk_flush_queue;
 
 #define BLKDEV_MIN_RQ	4
-#define BLKDEV_MAX_RQ	128	/* Default maximum */
+#define BLKDEV_DEFAULT_RQ	128
 
 typedef void (rq_end_io_fn)(struct request *, blk_status_t);
 
-- 
cgit v1.2.3


From e155b0c238b20f0a866f4334d292656665836c8a Mon Sep 17 00:00:00 2001
From: John Garry <john.garry@huawei.com>
Date: Tue, 5 Oct 2021 18:23:37 +0800
Subject: blk-mq: Use shared tags for shared sbitmap support

Currently we use separate sbitmap pairs and active_queues atomic_t for
shared sbitmap support.

However a full sets of static requests are used per HW queue, which is
quite wasteful, considering that the total number of requests usable at
any given time across all HW queues is limited by the shared sbitmap depth.

As such, it is considerably more memory efficient in the case of shared
sbitmap to allocate a set of static rqs per tag set or request queue, and
not per HW queue.

So replace the sbitmap pairs and active_queues atomic_t with a shared
tags per tagset and request queue, which will hold a set of shared static
rqs.

Since there is now no valid HW queue index to be passed to the blk_mq_ops
.init and .exit_request callbacks, pass an invalid index token. This
changes the semantics of the APIs, such that the callback would need to
validate the HW queue index before using it. Currently no user of shared
sbitmap actually uses the HW queue index (as would be expected).

Signed-off-by: John Garry <john.garry@huawei.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/1633429419-228500-13-git-send-email-john.garry@huawei.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 15 +++++++--------
 include/linux/blkdev.h |  3 +--
 2 files changed, 8 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 31cc41dfef6b..faa20a19bfcc 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -440,13 +440,11 @@ enum hctx_type {
  * @flags:	   Zero or more BLK_MQ_F_* flags.
  * @driver_data:   Pointer to data owned by the block driver that created this
  *		   tag set.
- * @active_queues_shared_sbitmap:
- * 		   number of active request queues per tag set.
- * @__bitmap_tags: A shared tags sbitmap, used over all hctx's
- * @__breserved_tags:
- *		   A shared reserved tags sbitmap, used over all hctx's
  * @tags:	   Tag sets. One tag set per hardware queue. Has @nr_hw_queues
  *		   elements.
+ * @shared_sbitmap_tags:
+ *		   Shared sbitmap set of tags. Has @nr_hw_queues elements. If
+ *		   set, shared by all @tags.
  * @tag_list_lock: Serializes tag_list accesses.
  * @tag_list:	   List of the request queues that use this tag set. See also
  *		   request_queue.tag_set_list.
@@ -463,12 +461,11 @@ struct blk_mq_tag_set {
 	unsigned int		timeout;
 	unsigned int		flags;
 	void			*driver_data;
-	atomic_t		active_queues_shared_sbitmap;
 
-	struct sbitmap_queue	__bitmap_tags;
-	struct sbitmap_queue	__breserved_tags;
 	struct blk_mq_tags	**tags;
 
+	struct blk_mq_tags	*shared_sbitmap_tags;
+
 	struct mutex		tag_list_lock;
 	struct list_head	tag_list;
 };
@@ -640,6 +637,8 @@ enum {
 	((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \
 		<< BLK_MQ_F_ALLOC_POLICY_START_BIT)
 
+#define BLK_MQ_NO_HCTX_IDX	(-1U)
+
 struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
 		struct lock_class_key *lkclass);
 #define blk_mq_alloc_disk(set, queuedata)				\
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0e960d74615e..cf92c13eb80e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -238,8 +238,7 @@ struct request_queue {
 
 	atomic_t		nr_active_requests_shared_sbitmap;
 
-	struct sbitmap_queue	sched_bitmap_tags;
-	struct sbitmap_queue	sched_breserved_tags;
+	struct blk_mq_tags	*shared_sbitmap_tags;
 
 	struct list_head	icq_list;
 #ifdef CONFIG_BLK_CGROUP
-- 
cgit v1.2.3


From 079a2e3e862548087041a1873bbffceb41a72a33 Mon Sep 17 00:00:00 2001
From: John Garry <john.garry@huawei.com>
Date: Tue, 5 Oct 2021 18:23:39 +0800
Subject: blk-mq: Change shared sbitmap naming to shared tags

Now that shared sbitmap support really means shared tags, rename symbols
to match that.

Signed-off-by: John Garry <john.garry@huawei.com>
Link: https://lore.kernel.org/r/1633429419-228500-15-git-send-email-john.garry@huawei.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 8 ++++----
 include/linux/blkdev.h | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index faa20a19bfcc..75d75657df21 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -442,9 +442,9 @@ enum hctx_type {
  *		   tag set.
  * @tags:	   Tag sets. One tag set per hardware queue. Has @nr_hw_queues
  *		   elements.
- * @shared_sbitmap_tags:
- *		   Shared sbitmap set of tags. Has @nr_hw_queues elements. If
- *		   set, shared by all @tags.
+ * @shared_tags:
+ *		   Shared set of tags. Has @nr_hw_queues elements. If set,
+ *		   shared by all @tags.
  * @tag_list_lock: Serializes tag_list accesses.
  * @tag_list:	   List of the request queues that use this tag set. See also
  *		   request_queue.tag_set_list.
@@ -464,7 +464,7 @@ struct blk_mq_tag_set {
 
 	struct blk_mq_tags	**tags;
 
-	struct blk_mq_tags	*shared_sbitmap_tags;
+	struct blk_mq_tags	*shared_tags;
 
 	struct mutex		tag_list_lock;
 	struct list_head	tag_list;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index cf92c13eb80e..b19172db7eef 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -236,9 +236,9 @@ struct request_queue {
 	struct timer_list	timeout;
 	struct work_struct	timeout_work;
 
-	atomic_t		nr_active_requests_shared_sbitmap;
+	atomic_t		nr_active_requests_shared_tags;
 
-	struct blk_mq_tags	*shared_sbitmap_tags;
+	struct blk_mq_tags	*sched_shared_tags;
 
 	struct list_head	icq_list;
 #ifdef CONFIG_BLK_CGROUP
-- 
cgit v1.2.3


From ba0ffdd8ce48ad7f7e85191cd29f9674caca3745 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 6 Oct 2021 12:01:07 -0600
Subject: block: bump max plugged deferred size from 16 to 32

Particularly for NVMe with efficient deferred submission for many
requests, there are nice benefits to be seen by bumping the default max
plug count from 16 to 32. This is especially true for virtualized setups,
where the submit part is more expensive. But can be noticed even on
native hardware.

Reduce the multiple queue factor from 4 to 2, since we're changing the
default size.

While changing it, move the defines into the block layer private header.
These aren't values that anyone outside of the block layer uses, or
should use.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b19172db7eef..472b4ab007c6 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -727,8 +727,6 @@ struct blk_plug {
 	bool multiple_queues;
 	bool nowait;
 };
-#define BLK_MAX_REQUEST_COUNT 16
-#define BLK_PLUG_FLUSH_SIZE (128 * 1024)
 
 struct blk_plug_cb;
 typedef void (*blk_plug_cb_fn)(struct blk_plug_cb *, bool);
-- 
cgit v1.2.3


From 47c122e35d7e43b14129ceb9ed3a7e67599978fa Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 6 Oct 2021 06:34:11 -0600
Subject: block: pre-allocate requests if plug is started and is a batch

The caller typically has a good (or even exact) idea of how many requests
it needs to submit. We can make the request/tag allocation a lot more
efficient if we just allocate N requests/tags upfront when we queue the
first bio from the batch.

Provide a new plug start helper that allows the caller to specify how many
IOs are expected. This sets plug->nr_ios, and we can use that for smarter
request allocation. The plug provides a holding spot for requests, and
request allocation will check it before calling into the normal request
allocation path.

The blk_finish_plug() is called, check if there are unused requests and
free them. This should not happen in normal operations. The exception is
if we get merging, then we may be left with requests that need freeing
when done.

This raises the per-core performance on my setup from ~5.8M to ~6.1M
IOPS.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h |  5 ++++-
 include/linux/blkdev.h | 15 ++++++++++++++-
 2 files changed, 18 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 75d75657df21..0e941f217578 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -90,7 +90,10 @@ struct request {
 	struct bio *bio;
 	struct bio *biotail;
 
-	struct list_head queuelist;
+	union {
+		struct list_head queuelist;
+		struct request *rq_next;
+	};
 
 	/*
 	 * The hash is used inside the scheduler, and killed once the
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 472b4ab007c6..17705c970d7e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -722,10 +722,17 @@ extern void blk_set_queue_dying(struct request_queue *);
  */
 struct blk_plug {
 	struct list_head mq_list; /* blk-mq requests */
-	struct list_head cb_list; /* md requires an unplug callback */
+
+	/* if ios_left is > 1, we can batch tag/rq allocations */
+	struct request *cached_rq;
+	unsigned short nr_ios;
+
 	unsigned short rq_count;
+
 	bool multiple_queues;
 	bool nowait;
+
+	struct list_head cb_list; /* md requires an unplug callback */
 };
 
 struct blk_plug_cb;
@@ -738,6 +745,7 @@ struct blk_plug_cb {
 extern struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug,
 					     void *data, int size);
 extern void blk_start_plug(struct blk_plug *);
+extern void blk_start_plug_nr_ios(struct blk_plug *, unsigned short);
 extern void blk_finish_plug(struct blk_plug *);
 extern void blk_flush_plug_list(struct blk_plug *, bool);
 
@@ -772,6 +780,11 @@ long nr_blockdev_pages(void);
 struct blk_plug {
 };
 
+static inline void blk_start_plug_nr_ios(struct blk_plug *plug,
+					 unsigned short nr_ios)
+{
+}
+
 static inline void blk_start_plug(struct blk_plug *plug)
 {
 }
-- 
cgit v1.2.3


From 84b8514b46b417e299746b1297d3d0899e8539f3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 12 Oct 2021 12:44:49 +0200
Subject: block: move the *blkdev_ioctl declarations out of blkdev.h

These are only used inside of block/.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20211012104450.659013-3-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/genhd.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 0b48a0cf4262..cd4038fd5743 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -281,10 +281,6 @@ bool bdev_check_media_change(struct block_device *bdev);
 int __invalidate_device(struct block_device *bdev, bool kill_dirty);
 void set_capacity(struct gendisk *disk, sector_t size);
 
-/* for drivers/char/raw.c: */
-int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long);
-long compat_blkdev_ioctl(struct file *, unsigned, unsigned long);
-
 #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
 int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk);
 void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk);
-- 
cgit v1.2.3


From 9e8c0d0d4d21d2c2c60132e2c18a73d08a230b42 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 12 Oct 2021 18:17:57 +0200
Subject: block: remove BIO_BUG_ON

BIO_DEBUG is always defined, so just switch the two instances to use
BUG_ON directly.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20211012161804.991559-2-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bio.h | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 00952e92eae1..65a356fa7110 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -11,14 +11,6 @@
 #include <linux/blk_types.h>
 #include <linux/uio.h>
 
-#define BIO_DEBUG
-
-#ifdef BIO_DEBUG
-#define BIO_BUG_ON	BUG_ON
-#else
-#define BIO_BUG_ON
-#endif
-
 #define BIO_MAX_VECS		256U
 
 static inline unsigned int bio_max_segs(unsigned int nr_segs)
-- 
cgit v1.2.3


From 11d9cab1ca6ed08747c6c5a274c6a8e8307aefbc Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 12 Oct 2021 18:17:58 +0200
Subject: block: don't include <linux/ioprio.h> in <linux/bio.h>

bio.h doesn't need any of the definitions from ioprio.h.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20211012161804.991559-3-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bio.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 65a356fa7110..ae94794d07c9 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -6,7 +6,6 @@
 #define __LINUX_BIO_H
 
 #include <linux/mempool.h>
-#include <linux/ioprio.h>
 /* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */
 #include <linux/blk_types.h>
 #include <linux/uio.h>
-- 
cgit v1.2.3


From 8addffd657a956944c1b8a74a1c9aabcfc5b4530 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 12 Oct 2021 18:17:59 +0200
Subject: block: move bio_mergeable out of bio.h

bio_mergeable is only needed by I/O schedulers, so move it to
blk-mq-sched.h.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20211012161804.991559-4-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bio.h | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index ae94794d07c9..ec255f282994 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -69,14 +69,6 @@ static inline bool bio_no_advance_iter(const struct bio *bio)
 	       bio_op(bio) == REQ_OP_WRITE_ZEROES;
 }
 
-static inline bool bio_mergeable(struct bio *bio)
-{
-	if (bio->bi_opf & REQ_NOMERGE_FLAGS)
-		return false;
-
-	return true;
-}
-
 static inline unsigned int bio_cur_bytes(struct bio *bio)
 {
 	if (bio_has_data(bio))
-- 
cgit v1.2.3


From b6559d8f9fdd7f0e139161cffea2645bd8d084c6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 12 Oct 2021 18:18:00 +0200
Subject: block: fold bio_cur_bytes into blk_rq_cur_bytes

Fold bio_cur_bytes into the only caller.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20211012161804.991559-5-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bio.h    | 8 --------
 include/linux/blk-mq.h | 6 +++++-
 2 files changed, 5 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index ec255f282994..cffd5eba401e 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -69,14 +69,6 @@ static inline bool bio_no_advance_iter(const struct bio *bio)
 	       bio_op(bio) == REQ_OP_WRITE_ZEROES;
 }
 
-static inline unsigned int bio_cur_bytes(struct bio *bio)
-{
-	if (bio_has_data(bio))
-		return bio_iovec(bio).bv_len;
-	else /* dataless requests such as discard */
-		return bio->bi_iter.bi_size;
-}
-
 static inline void *bio_data(struct bio *bio)
 {
 	if (bio_has_data(bio))
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 0e941f217578..2219e9277118 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -927,7 +927,11 @@ static inline unsigned int blk_rq_bytes(const struct request *rq)
 
 static inline int blk_rq_cur_bytes(const struct request *rq)
 {
-	return rq->bio ? bio_cur_bytes(rq->bio) : 0;
+	if (!rq->bio)
+		return 0;
+	if (!bio_has_data(rq->bio))	/* dataless requests such as discard */
+		return rq->bio->bi_iter.bi_size;
+	return bio_iovec(rq->bio).bv_len;
 }
 
 unsigned int blk_rq_err_bytes(const struct request *rq);
-- 
cgit v1.2.3


From 9a6083becbe113ed1e28059ce659dc8ae71b33c3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 12 Oct 2021 18:18:01 +0200
Subject: block: move bio_full out of bio.h

bio_full is only used in bio.c, so move it there.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20211012161804.991559-6-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bio.h | 19 -------------------
 1 file changed, 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index cffd5eba401e..2ffc7c768091 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -77,25 +77,6 @@ static inline void *bio_data(struct bio *bio)
 	return NULL;
 }
 
-/**
- * bio_full - check if the bio is full
- * @bio:	bio to check
- * @len:	length of one segment to be added
- *
- * Return true if @bio is full and one segment with @len bytes can't be
- * added to the bio, otherwise return false
- */
-static inline bool bio_full(struct bio *bio, unsigned len)
-{
-	if (bio->bi_vcnt >= bio->bi_max_vecs)
-		return true;
-
-	if (bio->bi_iter.bi_size > UINT_MAX - len)
-		return true;
-
-	return false;
-}
-
 static inline bool bio_next_segment(const struct bio *bio,
 				    struct bvec_iter_all *iter)
 {
-- 
cgit v1.2.3


From 9774b39175fe3606ce2a836a47134c53b75681c9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 12 Oct 2021 18:18:02 +0200
Subject: block: mark __bio_try_merge_page static

Mark __bio_try_merge_page static and move it up a bit to avoid the need
for a forward declaration.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20211012161804.991559-7-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bio.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 2ffc7c768091..faa0a2fabaf0 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -425,8 +425,6 @@ extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
 			   unsigned int, unsigned int);
 int bio_add_zone_append_page(struct bio *bio, struct page *page,
 			     unsigned int len, unsigned int offset);
-bool __bio_try_merge_page(struct bio *bio, struct page *page,
-		unsigned int len, unsigned int off, bool *same_page);
 void __bio_add_page(struct bio *bio, struct page *page,
 		unsigned int len, unsigned int off);
 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter);
-- 
cgit v1.2.3


From ff18d77b5f0cf3e25aa13741eed4d788a13c04d7 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 12 Oct 2021 18:18:03 +0200
Subject: block: move bio_get_{first,last}_bvec out of bio.h

bio_get_first_bvec and bio_get_last_bvec are only used in blk-merge.c,
so move them there.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20211012161804.991559-8-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bio.h | 31 -------------------------------
 1 file changed, 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index faa0a2fabaf0..4c5106f2ed57 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -221,37 +221,6 @@ static inline void bio_clear_flag(struct bio *bio, unsigned int bit)
 	bio->bi_flags &= ~(1U << bit);
 }
 
-static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv)
-{
-	*bv = mp_bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
-}
-
-static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv)
-{
-	struct bvec_iter iter = bio->bi_iter;
-	int idx;
-
-	bio_get_first_bvec(bio, bv);
-	if (bv->bv_len == bio->bi_iter.bi_size)
-		return;		/* this bio only has a single bvec */
-
-	bio_advance_iter(bio, &iter, iter.bi_size);
-
-	if (!iter.bi_bvec_done)
-		idx = iter.bi_idx - 1;
-	else	/* in the middle of bvec */
-		idx = iter.bi_idx;
-
-	*bv = bio->bi_io_vec[idx];
-
-	/*
-	 * iter.bi_bvec_done records actual length of the last bvec
-	 * if this bio ends in the middle of one io vector
-	 */
-	if (iter.bi_bvec_done)
-		bv->bv_len = iter.bi_bvec_done;
-}
-
 static inline struct bio_vec *bio_first_bvec_all(struct bio *bio)
 {
 	WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
-- 
cgit v1.2.3


From 4f7ab09a1ca0bc955635705c359ab3021b405fd0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 12 Oct 2021 18:18:04 +0200
Subject: block: mark bio_truncate static

bio_truncate is only used in bio.c, so mark it static.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20211012161804.991559-9-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bio.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 4c5106f2ed57..d8d27742a75f 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -405,7 +405,6 @@ extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
 			       struct bio *src, struct bvec_iter *src_iter);
 extern void bio_copy_data(struct bio *dst, struct bio *src);
 extern void bio_free_pages(struct bio *bio);
-void bio_truncate(struct bio *bio, unsigned new_size);
 void guard_bio_eod(struct bio *bio);
 void zero_fill_bio(struct bio *bio);
 
-- 
cgit v1.2.3


From 9672b0d43782047b1825a96bafee1b6aefa35bc2 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 9 Oct 2021 13:02:23 -0600
Subject: sbitmap: add __sbitmap_queue_get_batch()

The block layer tag allocation batching still calls into sbitmap to get
each tag, but we can improve on that. Add __sbitmap_queue_get_batch(),
which returns a mask of tags all at once, along with an offset for
those tags.

An example return would be 0xff, where bits 0..7 are set, with
tag_offset == 128. The valid tags in this case would be 128..135.

A batch is specific to an individual sbitmap_map, hence it cannot be
larger than that. The requested number of tags is automatically reduced
to the max that can be satisfied with a single map.

On failure, 0 is returned. Caller should fall back to single tag
allocation at that point/

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/sbitmap.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index 2713e689ad66..e30b56023ead 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -426,6 +426,19 @@ void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth);
  */
 int __sbitmap_queue_get(struct sbitmap_queue *sbq);
 
+/**
+ * __sbitmap_queue_get_batch() - Try to allocate a batch of free bits
+ * @sbq: Bitmap queue to allocate from.
+ * @nr_tags: number of tags requested
+ * @offset: offset to add to returned bits
+ *
+ * Return: Mask of allocated tags, 0 if none are found. Each tag allocated is
+ * a bit in the mask returned, and the caller must add @offset to the value to
+ * get the absolute tag value.
+ */
+unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags,
+					unsigned int *offset);
+
 /**
  * __sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct
  * sbitmap_queue, limiting the depth used from each word, with preemption
-- 
cgit v1.2.3


From f70299f0d58e0e21f7f5f5ab27e601e8d3f0373e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 12 Oct 2021 13:12:15 +0200
Subject: blk-mq: factor out a blk_qc_to_hctx helper

Add a helper to get the hctx from a request_queue and cookie, and fold
the blk_qc_t_to_queue_num helper into it as no other callers are left.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Tested-by: Mark Wunderlich <mark.wunderlich@intel.com>
Link: https://lore.kernel.org/r/20211012111226.760968-6-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk_types.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 3b967053e9f5..000351c5312a 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -505,11 +505,6 @@ static inline bool blk_qc_t_valid(blk_qc_t cookie)
 	return cookie != BLK_QC_T_NONE;
 }
 
-static inline unsigned int blk_qc_t_to_queue_num(blk_qc_t cookie)
-{
-	return (cookie & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT;
-}
-
 static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie)
 {
 	return cookie & ((1u << BLK_QC_T_SHIFT) - 1);
-- 
cgit v1.2.3


From efbabbe121f96d4b1a98a2c2ef5d2e8f7fb41c87 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 12 Oct 2021 13:12:17 +0200
Subject: blk-mq: remove blk_qc_t_to_tag and blk_qc_t_is_internal

Merge both functions into their only caller to keep the blk-mq tag to
blk_qc_t mapping as private as possible in blk-mq.c.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Tested-by: Mark Wunderlich <mark.wunderlich@intel.com>
Link: https://lore.kernel.org/r/20211012111226.760968-8-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk_types.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 000351c5312a..fb7c1477617b 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -505,16 +505,6 @@ static inline bool blk_qc_t_valid(blk_qc_t cookie)
 	return cookie != BLK_QC_T_NONE;
 }
 
-static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie)
-{
-	return cookie & ((1u << BLK_QC_T_SHIFT) - 1);
-}
-
-static inline bool blk_qc_t_is_internal(blk_qc_t cookie)
-{
-	return (cookie & BLK_QC_T_INTERNAL) != 0;
-}
-
 struct blk_rq_stat {
 	u64 mean;
 	u64 min;
-- 
cgit v1.2.3


From 28a1ae6b9daba6ac65700eeb38479bd6fadec089 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 12 Oct 2021 13:12:18 +0200
Subject: blk-mq: remove blk_qc_t_valid

Move the trivial check into the only caller.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Tested-by: Mark Wunderlich <mark.wunderlich@intel.com>
Link: https://lore.kernel.org/r/20211012111226.760968-9-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk_types.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index fb7c1477617b..5017ba8fc539 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -500,11 +500,6 @@ typedef unsigned int blk_qc_t;
 #define BLK_QC_T_SHIFT		16
 #define BLK_QC_T_INTERNAL	(1U << 31)
 
-static inline bool blk_qc_t_valid(blk_qc_t cookie)
-{
-	return cookie != BLK_QC_T_NONE;
-}
-
 struct blk_rq_stat {
 	u64 mean;
 	u64 min;
-- 
cgit v1.2.3


From ef99b2d37666b7a600baab9e1c4944436652b0a2 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 12 Oct 2021 13:12:19 +0200
Subject: block: replace the spin argument to blk_iopoll with a flags argument

Switch the boolean spin argument to blk_poll to passing a set of flags
instead.  This will allow to control polling behavior in a more fine
grained way.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: Mark Wunderlich <mark.wunderlich@intel.com>
Link: https://lore.kernel.org/r/20211012111226.760968-10-hch@lst.de
[axboe: adapt to changed io_uring iopoll]
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 4 +++-
 include/linux/fs.h     | 2 +-
 include/linux/iomap.h  | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 17705c970d7e..e177346bc020 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -564,7 +564,9 @@ extern const char *blk_op_str(unsigned int op);
 int blk_status_to_errno(blk_status_t status);
 blk_status_t errno_to_blk_status(int errno);
 
-int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin);
+/* only poll the hardware once, don't continue until a completion was found */
+#define BLK_POLL_ONESHOT		(1 << 0)
+int blk_poll(struct request_queue *q, blk_qc_t cookie, unsigned int flags);
 
 static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
 {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e7a633353fd2..c443cddf414f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2075,7 +2075,7 @@ struct file_operations {
 	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
 	ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
 	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
-	int (*iopoll)(struct kiocb *kiocb, bool spin);
+	int (*iopoll)(struct kiocb *kiocb, unsigned int flags);
 	int (*iterate) (struct file *, struct dir_context *);
 	int (*iterate_shared) (struct file *, struct dir_context *);
 	__poll_t (*poll) (struct file *, struct poll_table_struct *);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 24f8489583ca..1e86b65567c2 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -337,7 +337,7 @@ struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
 		unsigned int dio_flags);
 ssize_t iomap_dio_complete(struct iomap_dio *dio);
-int iomap_dio_iopoll(struct kiocb *kiocb, bool spin);
+int iomap_dio_iopoll(struct kiocb *kiocb, unsigned int flags);
 
 #ifdef CONFIG_SWAP
 struct file;
-- 
cgit v1.2.3


From d729cf9acb9311956c8a37113dcfa0160a2d9665 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 12 Oct 2021 13:12:20 +0200
Subject: io_uring: don't sleep when polling for I/O

There is no point in sleeping for the expected I/O completion timeout
in the io_uring async polling model as we never poll for a specific
I/O.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: Mark Wunderlich <mark.wunderlich@intel.com>
Link: https://lore.kernel.org/r/20211012111226.760968-11-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e177346bc020..2b80c98fc373 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -566,6 +566,8 @@ blk_status_t errno_to_blk_status(int errno);
 
 /* only poll the hardware once, don't continue until a completion was found */
 #define BLK_POLL_ONESHOT		(1 << 0)
+/* do not sleep to wait for the expected completion time */
+#define BLK_POLL_NOSLEEP		(1 << 1)
 int blk_poll(struct request_queue *q, blk_qc_t cookie, unsigned int flags);
 
 static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
-- 
cgit v1.2.3


From 6ce913fe3eee14f40f778e85999c9e599dda8c6b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 12 Oct 2021 13:12:21 +0200
Subject: block: rename REQ_HIPRI to REQ_POLLED

Unlike the RWF_HIPRI userspace ABI which is intentionally kept vague,
the bio flag is specific to the polling implementation, so rename and
document it properly.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Tested-by: Mark Wunderlich <mark.wunderlich@intel.com>
Link: https://lore.kernel.org/r/20211012111226.760968-12-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bio.h       | 2 +-
 include/linux/blk_types.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index d8d27742a75f..c7a2d880e927 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -706,7 +706,7 @@ static inline int bio_integrity_add_page(struct bio *bio, struct page *page,
  */
 static inline void bio_set_polled(struct bio *bio, struct kiocb *kiocb)
 {
-	bio->bi_opf |= REQ_HIPRI;
+	bio->bi_opf |= REQ_POLLED;
 	if (!is_sync_kiocb(kiocb))
 		bio->bi_opf |= REQ_NOWAIT;
 }
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 5017ba8fc539..f8b9fce68834 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -384,7 +384,7 @@ enum req_flag_bits {
 	/* command specific flags for REQ_OP_WRITE_ZEROES: */
 	__REQ_NOUNMAP,		/* do not free blocks when zeroing */
 
-	__REQ_HIPRI,
+	__REQ_POLLED,		/* caller polls for completion using blk_poll */
 
 	/* for driver use */
 	__REQ_DRV,
@@ -409,7 +409,7 @@ enum req_flag_bits {
 #define REQ_CGROUP_PUNT		(1ULL << __REQ_CGROUP_PUNT)
 
 #define REQ_NOUNMAP		(1ULL << __REQ_NOUNMAP)
-#define REQ_HIPRI		(1ULL << __REQ_HIPRI)
+#define REQ_POLLED		(1ULL << __REQ_POLLED)
 
 #define REQ_DRV			(1ULL << __REQ_DRV)
 #define REQ_SWAP		(1ULL << __REQ_SWAP)
-- 
cgit v1.2.3


From 19416123ab3e1348b3532347af221d8f60838431 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Tue, 12 Oct 2021 13:12:23 +0200
Subject: block: define 'struct bvec_iter' as packed

'struct bvec_iter' is embedded into 'struct bio', define it as packed
so that we can get one extra 4bytes for other uses without expanding
bio.

'struct bvec_iter' is often allocated on stack, so making it packed
doesn't affect performance. Also I have run io_uring on both
nvme/null_blk, and not observe performance effect in this way.

Suggested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: Mark Wunderlich <mark.wunderlich@intel.com>
Link: https://lore.kernel.org/r/20211012111226.760968-14-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bvec.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index 0e9bdd42dafb..35c25dff651a 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -44,7 +44,7 @@ struct bvec_iter {
 
 	unsigned int            bi_bvec_done;	/* number of bytes completed in
 						   current bvec */
-};
+} __packed;
 
 struct bvec_iter_all {
 	struct bio_vec	bv;
-- 
cgit v1.2.3


From 3e08773c3841e9db7a520908cc2b136a77d275ff Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 12 Oct 2021 13:12:24 +0200
Subject: block: switch polling to be bio based

Replace the blk_poll interface that requires the caller to keep a queue
and cookie from the submissions with polling based on the bio.

Polling for the bio itself leads to a few advantages:

 - the cookie construction can made entirely private in blk-mq.c
 - the caller does not need to remember the request_queue and cookie
   separately and thus sidesteps their lifetime issues
 - keeping the device and the cookie inside the bio allows to trivially
   support polling BIOs remapping by stacking drivers
 - a lot of code to propagate the cookie back up the submission path can
   be removed entirely.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: Mark Wunderlich <mark.wunderlich@intel.com>
Link: https://lore.kernel.org/r/20211012111226.760968-15-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bio.h       |  2 +-
 include/linux/blk-mq.h    | 15 ++-------------
 include/linux/blk_types.h | 12 +++++-------
 include/linux/blkdev.h    |  8 +++++---
 include/linux/fs.h        |  6 +-----
 include/linux/iomap.h     |  5 ++---
 6 files changed, 16 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index c7a2d880e927..62d684b7dd4c 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -349,7 +349,7 @@ static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned short nr_iovecs)
 	return bio_alloc_bioset(gfp_mask, nr_iovecs, &fs_bio_set);
 }
 
-extern blk_qc_t submit_bio(struct bio *);
+void submit_bio(struct bio *bio);
 
 extern void bio_endio(struct bio *);
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 2219e9277118..a9c1d0882550 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -359,9 +359,9 @@ struct blk_mq_hw_ctx {
 	/** @kobj: Kernel object for sysfs. */
 	struct kobject		kobj;
 
-	/** @poll_considered: Count times blk_poll() was called. */
+	/** @poll_considered: Count times blk_mq_poll() was called. */
 	unsigned long		poll_considered;
-	/** @poll_invoked: Count how many requests blk_poll() polled. */
+	/** @poll_invoked: Count how many requests blk_mq_poll() polled. */
 	unsigned long		poll_invoked;
 	/** @poll_success: Count how many polled requests were completed. */
 	unsigned long		poll_success;
@@ -815,16 +815,6 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq)
 	for ((i) = 0; (i) < (hctx)->nr_ctx &&				\
 	     ({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++)
 
-static inline blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx,
-		struct request *rq)
-{
-	if (rq->tag != -1)
-		return rq->tag | (hctx->queue_num << BLK_QC_T_SHIFT);
-
-	return rq->internal_tag | (hctx->queue_num << BLK_QC_T_SHIFT) |
-			BLK_QC_T_INTERNAL;
-}
-
 static inline void blk_mq_cleanup_rq(struct request *rq)
 {
 	if (rq->q->mq_ops->cleanup_rq)
@@ -843,7 +833,6 @@ static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio,
 		rq->rq_disk = bio->bi_bdev->bd_disk;
 }
 
-blk_qc_t blk_mq_submit_bio(struct bio *bio);
 void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
 		struct lock_class_key *key);
 
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index f8b9fce68834..72736b4c057c 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -208,6 +208,9 @@ static inline void bio_issue_init(struct bio_issue *issue,
 			((u64)size << BIO_ISSUE_SIZE_SHIFT));
 }
 
+typedef unsigned int blk_qc_t;
+#define BLK_QC_T_NONE		-1U
+
 /*
  * main unit of I/O for the block layer and lower layers (ie drivers and
  * stacking drivers)
@@ -227,8 +230,8 @@ struct bio {
 
 	struct bvec_iter	bi_iter;
 
+	blk_qc_t		bi_cookie;
 	bio_end_io_t		*bi_end_io;
-
 	void			*bi_private;
 #ifdef CONFIG_BLK_CGROUP
 	/*
@@ -384,7 +387,7 @@ enum req_flag_bits {
 	/* command specific flags for REQ_OP_WRITE_ZEROES: */
 	__REQ_NOUNMAP,		/* do not free blocks when zeroing */
 
-	__REQ_POLLED,		/* caller polls for completion using blk_poll */
+	__REQ_POLLED,		/* caller polls for completion using bio_poll */
 
 	/* for driver use */
 	__REQ_DRV,
@@ -495,11 +498,6 @@ static inline int op_stat_group(unsigned int op)
 	return op_is_write(op);
 }
 
-typedef unsigned int blk_qc_t;
-#define BLK_QC_T_NONE		-1U
-#define BLK_QC_T_SHIFT		16
-#define BLK_QC_T_INTERNAL	(1U << 31)
-
 struct blk_rq_stat {
 	u64 mean;
 	u64 min;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2b80c98fc373..2a8689e949b4 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -25,6 +25,7 @@ struct request;
 struct sg_io_hdr;
 struct blkcg_gq;
 struct blk_flush_queue;
+struct kiocb;
 struct pr_ops;
 struct rq_qos;
 struct blk_queue_stats;
@@ -550,7 +551,7 @@ static inline unsigned int blk_queue_depth(struct request_queue *q)
 
 extern int blk_register_queue(struct gendisk *disk);
 extern void blk_unregister_queue(struct gendisk *disk);
-blk_qc_t submit_bio_noacct(struct bio *bio);
+void submit_bio_noacct(struct bio *bio);
 
 extern int blk_lld_busy(struct request_queue *q);
 extern void blk_queue_split(struct bio **);
@@ -568,7 +569,8 @@ blk_status_t errno_to_blk_status(int errno);
 #define BLK_POLL_ONESHOT		(1 << 0)
 /* do not sleep to wait for the expected completion time */
 #define BLK_POLL_NOSLEEP		(1 << 1)
-int blk_poll(struct request_queue *q, blk_qc_t cookie, unsigned int flags);
+int bio_poll(struct bio *bio, unsigned int flags);
+int iocb_bio_iopoll(struct kiocb *kiocb, unsigned int flags);
 
 static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
 {
@@ -1176,7 +1178,7 @@ static inline void blk_ksm_unregister(struct request_queue *q) { }
 
 
 struct block_device_operations {
-	blk_qc_t (*submit_bio) (struct bio *bio);
+	void (*submit_bio)(struct bio *bio);
 	int (*open) (struct block_device *, fmode_t);
 	void (*release) (struct gendisk *, fmode_t);
 	int (*rw_page)(struct block_device *, sector_t, struct page *, unsigned int);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c443cddf414f..f595f4097cb7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -334,11 +334,7 @@ struct kiocb {
 	int			ki_flags;
 	u16			ki_hint;
 	u16			ki_ioprio; /* See linux/ioprio.h */
-	union {
-		unsigned int		ki_cookie; /* for ->iopoll */
-		struct wait_page_queue	*ki_waitq; /* for async buffered IO */
-	};
-
+	struct wait_page_queue	*ki_waitq; /* for async buffered IO */
 	randomized_struct_fields_end
 };
 
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 1e86b65567c2..63f4ea4dac9b 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -313,8 +313,8 @@ int iomap_writepages(struct address_space *mapping,
 struct iomap_dio_ops {
 	int (*end_io)(struct kiocb *iocb, ssize_t size, int error,
 		      unsigned flags);
-	blk_qc_t (*submit_io)(const struct iomap_iter *iter, struct bio *bio,
-			      loff_t file_offset);
+	void (*submit_io)(const struct iomap_iter *iter, struct bio *bio,
+		          loff_t file_offset);
 };
 
 /*
@@ -337,7 +337,6 @@ struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
 		unsigned int dio_flags);
 ssize_t iomap_dio_complete(struct iomap_dio *dio);
-int iomap_dio_iopoll(struct kiocb *kiocb, unsigned int flags);
 
 #ifdef CONFIG_SWAP
 struct file;
-- 
cgit v1.2.3


From 17220ca5ce9606c1b015c4316fca18734c2df0bb Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 14 Oct 2021 15:03:26 +0100
Subject: block: cache request queue in bdev

There are tons of places where we need to get a request_queue only
having bdev, which turns into bdev->bd_disk->queue. There are probably a
hundred of such places considering inline helpers, and enough of them
are in hot paths.

Cache queue pointer in struct block_device and make use of it in
bdev_get_queue().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/a3bfaecdd28956f03629d0ca5c63ebc096e1c809.1634219547.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk_types.h | 1 +
 include/linux/blkdev.h    | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 72736b4c057c..1e370929c89e 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -38,6 +38,7 @@ struct block_device {
 	u8			bd_partno;
 	spinlock_t		bd_size_lock; /* for bd_inode->i_size updates */
 	struct gendisk *	bd_disk;
+	struct request_queue *	bd_queue;
 
 	/* The counter of freeze processes */
 	int			bd_fsfreeze_count;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2a8689e949b4..d5b21fc8f49e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -574,7 +574,7 @@ int iocb_bio_iopoll(struct kiocb *kiocb, unsigned int flags);
 
 static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
 {
-	return bdev->bd_disk->queue;	/* this is never NULL */
+	return bdev->bd_queue;	/* this is never NULL */
 }
 
 /*
-- 
cgit v1.2.3


From b5f9c644eb1baafcd349ad134e2110773f8d0a38 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de>
Date: Mon, 4 Oct 2021 14:59:35 +0200
Subject: PCI: Remove struct pci_dev->driver
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are no remaining uses of the struct pci_dev->driver pointer, so
remove it.

Link: https://lore.kernel.org/r/20211004125935.2300113-12-u.kleine-koenig@pengutronix.de
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 include/linux/pci.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index a2d62165a7f7..03bfdb25a55c 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -342,7 +342,6 @@ struct pci_dev {
 	u16		pcie_flags_reg;	/* Cached PCIe Capabilities Register */
 	unsigned long	*dma_alias_mask;/* Mask of enabled devfn aliases */
 
-	struct pci_driver *driver;	/* Driver bound to this device */
 	u64		dma_mask;	/* Mask of the bits of bus address this
 					   device implements.  Normally this is
 					   0xffffffff.  You only need to change
-- 
cgit v1.2.3


From bb523b406c849eef8f265a07cd7f320f1f177743 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Mon, 2 Aug 2021 13:44:20 +0200
Subject: gup: Turn fault_in_pages_{readable,writeable} into
 fault_in_{readable,writeable}

Turn fault_in_pages_{readable,writeable} into versions that return the
number of bytes not faulted in, similar to copy_to_user, instead of
returning a non-zero value when any of the requested pages couldn't be
faulted in.  This supports the existing users that require all pages to
be faulted in as well as new users that are happy if any pages can be
faulted in.

Rename the functions to fault_in_{readable,writeable} to make sure
this change doesn't silently break things.

Neither of these functions is entirely trivial and it doesn't seem
useful to inline them, so move them to mm/gup.c.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 include/linux/pagemap.h | 57 +++----------------------------------------------
 1 file changed, 3 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 62db6b0176b9..9fe94f7a4f7e 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -733,61 +733,10 @@ int wait_on_page_private_2_killable(struct page *page);
 extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter);
 
 /*
- * Fault everything in given userspace address range in.
+ * Fault in userspace address range.
  */
-static inline int fault_in_pages_writeable(char __user *uaddr, size_t size)
-{
-	char __user *end = uaddr + size - 1;
-
-	if (unlikely(size == 0))
-		return 0;
-
-	if (unlikely(uaddr > end))
-		return -EFAULT;
-	/*
-	 * Writing zeroes into userspace here is OK, because we know that if
-	 * the zero gets there, we'll be overwriting it.
-	 */
-	do {
-		if (unlikely(__put_user(0, uaddr) != 0))
-			return -EFAULT;
-		uaddr += PAGE_SIZE;
-	} while (uaddr <= end);
-
-	/* Check whether the range spilled into the next page. */
-	if (((unsigned long)uaddr & PAGE_MASK) ==
-			((unsigned long)end & PAGE_MASK))
-		return __put_user(0, end);
-
-	return 0;
-}
-
-static inline int fault_in_pages_readable(const char __user *uaddr, size_t size)
-{
-	volatile char c;
-	const char __user *end = uaddr + size - 1;
-
-	if (unlikely(size == 0))
-		return 0;
-
-	if (unlikely(uaddr > end))
-		return -EFAULT;
-
-	do {
-		if (unlikely(__get_user(c, uaddr) != 0))
-			return -EFAULT;
-		uaddr += PAGE_SIZE;
-	} while (uaddr <= end);
-
-	/* Check whether the range spilled into the next page. */
-	if (((unsigned long)uaddr & PAGE_MASK) ==
-			((unsigned long)end & PAGE_MASK)) {
-		return __get_user(c, end);
-	}
-
-	(void)c;
-	return 0;
-}
+size_t fault_in_writeable(char __user *uaddr, size_t size);
+size_t fault_in_readable(const char __user *uaddr, size_t size);
 
 int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 				pgoff_t index, gfp_t gfp_mask);
-- 
cgit v1.2.3


From a6294593e8a1290091d0b078d5d33da5e0cd3dfe Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Mon, 2 Aug 2021 14:54:16 +0200
Subject: iov_iter: Turn iov_iter_fault_in_readable into
 fault_in_iov_iter_readable

Turn iov_iter_fault_in_readable into a function that returns the number
of bytes not faulted in, similar to copy_to_user, instead of returning a
non-zero value when any of the requested pages couldn't be faulted in.
This supports the existing users that require all pages to be faulted in
as well as new users that are happy if any pages can be faulted in.

Rename iov_iter_fault_in_readable to fault_in_iov_iter_readable to make
sure this change doesn't silently break things.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 include/linux/uio.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 207101a9c5c3..d18458af6681 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -133,7 +133,7 @@ size_t copy_page_from_iter_atomic(struct page *page, unsigned offset,
 				  size_t bytes, struct iov_iter *i);
 void iov_iter_advance(struct iov_iter *i, size_t bytes);
 void iov_iter_revert(struct iov_iter *i, size_t bytes);
-int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes);
+size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes);
 size_t iov_iter_single_seg_count(const struct iov_iter *i);
 size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
 			 struct iov_iter *i);
-- 
cgit v1.2.3


From d4aa57a1cac3c99ffd641f7c8e0a7aff5656de0d Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 13 Oct 2021 09:01:43 -0600
Subject: block: don't bother iter advancing a fully done bio

If we're completing nbytes and nbytes is the size of the bio, don't bother
with calling into the iterator increment helpers. Just clear the bio
size and we're done.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bio.h | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 62d684b7dd4c..9538f20ffaa5 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -119,6 +119,28 @@ static inline void bio_advance_iter_single(const struct bio *bio,
 		bvec_iter_advance_single(bio->bi_io_vec, iter, bytes);
 }
 
+void __bio_advance(struct bio *, unsigned bytes);
+
+/**
+ * bio_advance - increment/complete a bio by some number of bytes
+ * @bio:	bio to advance
+ * @bytes:	number of bytes to complete
+ *
+ * This updates bi_sector, bi_size and bi_idx; if the number of bytes to
+ * complete doesn't align with a bvec boundary, then bv_len and bv_offset will
+ * be updated on the last bvec as well.
+ *
+ * @bio will then represent the remaining, uncompleted portion of the io.
+ */
+static inline void bio_advance(struct bio *bio, unsigned int nbytes)
+{
+	if (nbytes == bio->bi_iter.bi_size) {
+		bio->bi_iter.bi_size = 0;
+		return;
+	}
+	__bio_advance(bio, nbytes);
+}
+
 #define __bio_for_each_segment(bvl, bio, iter, start)			\
 	for (iter = (start);						\
 	     (iter).bi_size &&						\
@@ -381,8 +403,6 @@ static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs)
 struct request_queue;
 
 extern int submit_bio_wait(struct bio *bio);
-extern void bio_advance(struct bio *, unsigned);
-
 extern void bio_init(struct bio *bio, struct bio_vec *table,
 		     unsigned short max_vecs);
 extern void bio_uninit(struct bio *);
-- 
cgit v1.2.3


From b60876296847e6cd7f1da4b8b7f0f31399d59aa1 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 15 Oct 2021 15:03:52 -0600
Subject: block: improve layout of struct request

It's been a while since this was analyzed, move some members around to
better flow with the use case. Initial state up top, and queued state
after that. This improves my peak case by about 1.5%, from 7750K to
7900K IOPS.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 90 ++++++++++++++++++++++++++------------------------
 1 file changed, 46 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index a9c1d0882550..8ca9728cc7f2 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -83,6 +83,8 @@ struct request {
 	int tag;
 	int internal_tag;
 
+	unsigned int timeout;
+
 	/* the following two fields are internal, NEVER access directly */
 	unsigned int __data_len;	/* total data len */
 	sector_t __sector;		/* sector cursor */
@@ -95,49 +97,6 @@ struct request {
 		struct request *rq_next;
 	};
 
-	/*
-	 * The hash is used inside the scheduler, and killed once the
-	 * request reaches the dispatch list. The ipi_list is only used
-	 * to queue the request for softirq completion, which is long
-	 * after the request has been unhashed (and even removed from
-	 * the dispatch list).
-	 */
-	union {
-		struct hlist_node hash;	/* merge hash */
-		struct llist_node ipi_list;
-	};
-
-	/*
-	 * The rb_node is only used inside the io scheduler, requests
-	 * are pruned when moved to the dispatch queue. So let the
-	 * completion_data share space with the rb_node.
-	 */
-	union {
-		struct rb_node rb_node;	/* sort/lookup */
-		struct bio_vec special_vec;
-		void *completion_data;
-		int error_count; /* for legacy drivers, don't use */
-	};
-
-	/*
-	 * Three pointers are available for the IO schedulers, if they need
-	 * more they have to dynamically allocate it.  Flush requests are
-	 * never put on the IO scheduler. So let the flush fields share
-	 * space with the elevator data.
-	 */
-	union {
-		struct {
-			struct io_cq		*icq;
-			void			*priv[2];
-		} elv;
-
-		struct {
-			unsigned int		seq;
-			struct list_head	list;
-			rq_end_io_fn		*saved_end_io;
-		} flush;
-	};
-
 	struct gendisk *rq_disk;
 	struct block_device *part;
 #ifdef CONFIG_BLK_RQ_ALLOC_TIME
@@ -180,9 +139,52 @@ struct request {
 	enum mq_rq_state state;
 	refcount_t ref;
 
-	unsigned int timeout;
 	unsigned long deadline;
 
+	/*
+	 * The hash is used inside the scheduler, and killed once the
+	 * request reaches the dispatch list. The ipi_list is only used
+	 * to queue the request for softirq completion, which is long
+	 * after the request has been unhashed (and even removed from
+	 * the dispatch list).
+	 */
+	union {
+		struct hlist_node hash;	/* merge hash */
+		struct llist_node ipi_list;
+	};
+
+	/*
+	 * The rb_node is only used inside the io scheduler, requests
+	 * are pruned when moved to the dispatch queue. So let the
+	 * completion_data share space with the rb_node.
+	 */
+	union {
+		struct rb_node rb_node;	/* sort/lookup */
+		struct bio_vec special_vec;
+		void *completion_data;
+		int error_count; /* for legacy drivers, don't use */
+	};
+
+
+	/*
+	 * Three pointers are available for the IO schedulers, if they need
+	 * more they have to dynamically allocate it.  Flush requests are
+	 * never put on the IO scheduler. So let the flush fields share
+	 * space with the elevator data.
+	 */
+	union {
+		struct {
+			struct io_cq		*icq;
+			void			*priv[2];
+		} elv;
+
+		struct {
+			unsigned int		seq;
+			struct list_head	list;
+			rq_end_io_fn		*saved_end_io;
+		} flush;
+	};
+
 	union {
 		struct __call_single_data csd;
 		u64 fifo_time;
-- 
cgit v1.2.3


From 2ff0682da6e09c1e0db63a2d2abcd4efb531c8db Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 15 Oct 2021 09:44:38 -0600
Subject: block: store elevator state in request

Add an rq private RQF_ELV flag, which tells the block layer that this
request was initialized on a queue that has an IO scheduler attached.
This allows for faster checking in the fast path, rather than having to
deference rq->q later on.

Elevator switching does full quiesce of the queue before detaching an
IO scheduler, so it's safe to cache this in the request itself.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 8ca9728cc7f2..95c3bd3a008e 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -55,6 +55,8 @@ typedef __u32 __bitwise req_flags_t;
 #define RQF_MQ_POLL_SLEPT	((__force req_flags_t)(1 << 20))
 /* ->timeout has been called, don't expire again */
 #define RQF_TIMED_OUT		((__force req_flags_t)(1 << 21))
+/* queue has elevator attached */
+#define RQF_ELV			((__force req_flags_t)(1 << 22))
 
 /* flags that prevent us from merging requests: */
 #define RQF_NOMERGE_FLAGS \
-- 
cgit v1.2.3


From 0d20e9fb1262b1f9ac895b287db892bc75b05b84 Mon Sep 17 00:00:00 2001
From: Alexandre Belloni <alexandre.belloni@bootlin.com>
Date: Mon, 18 Oct 2021 17:19:31 +0200
Subject: rtc: add BSM parameter

BSM or Backup Switch Mode is a common feature on RTCs, allowing to select
how the RTC will decide when to switch from its primary power supply to the
backup power supply. It is necessary to be able to set it from userspace as
there are uses cases where it has to be done dynamically.

Supported values are:
  RTC_BSM_DISABLED: disabled
  RTC_BSM_DIRECT: switching will happen as soon as Vbackup > Vdd
  RTC_BSM_LEVEL: switching will happen around a threshold, usually with an
  hysteresis
  RTC_BSM_STANDBY: switching will not happen until Vdd > Vbackup, this is
  useful to ensure the RTC doesn't draw any power until the device is first
  powered on.

Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Link: https://lore.kernel.org/r/20211018151933.76865-6-alexandre.belloni@bootlin.com
---
 include/linux/rtc.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index 354e0843ab17..47fd1c2d3a57 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -66,6 +66,8 @@ struct rtc_class_ops {
 	int (*alarm_irq_enable)(struct device *, unsigned int enabled);
 	int (*read_offset)(struct device *, long *offset);
 	int (*set_offset)(struct device *, long offset);
+	int (*param_get)(struct device *, struct rtc_param *param);
+	int (*param_set)(struct device *, struct rtc_param *param);
 };
 
 struct rtc_device;
-- 
cgit v1.2.3


From 88dee3b0efe4b769fb22ce2e963aabae403e6801 Mon Sep 17 00:00:00 2001
From: Cai Huoqing <caihuoqing@baidu.com>
Date: Mon, 18 Oct 2021 20:41:09 +0800
Subject: PCI: Remove unused pci_pool wrappers

The pci_pool users have been converted to dma_pool.  Remove the unused
pci_pool wrappers.

Link: https://lore.kernel.org/r/20211018124110.214-1-caihuoqing@baidu.com
Signed-off-by: Cai Huoqing <caihuoqing@baidu.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 include/linux/pci.h | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index cd8aa6fce204..2ae9dd7f975f 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1498,19 +1498,8 @@ int pci_set_vga_state(struct pci_dev *pdev, bool decode,
 #define PCI_IRQ_ALL_TYPES \
 	(PCI_IRQ_LEGACY | PCI_IRQ_MSI | PCI_IRQ_MSIX)
 
-/* kmem_cache style wrapper around pci_alloc_consistent() */
-
 #include <linux/dmapool.h>
 
-#define	pci_pool dma_pool
-#define pci_pool_create(name, pdev, size, align, allocation) \
-		dma_pool_create(name, &pdev->dev, size, align, allocation)
-#define	pci_pool_destroy(pool) dma_pool_destroy(pool)
-#define	pci_pool_alloc(pool, flags, handle) dma_pool_alloc(pool, flags, handle)
-#define	pci_pool_zalloc(pool, flags, handle) \
-		dma_pool_zalloc(pool, flags, handle)
-#define	pci_pool_free(pool, vaddr, addr) dma_pool_free(pool, vaddr, addr)
-
 struct msix_entry {
 	u32	vector;	/* Kernel uses to write allocated vector */
 	u16	entry;	/* Driver uses to specify entry, OS writes */
-- 
cgit v1.2.3


From 4797632f4f1d8af4e0670adcb97bf9800dc3beca Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 17 May 2021 20:16:57 -0700
Subject: string.h: Introduce memset_after() for wiping trailing
 members/padding

A common idiom in kernel code is to wipe the contents of a structure
after a given member. This is especially useful in places where there is
trailing padding. These open-coded cases are usually difficult to read
and very sensitive to struct layout changes. Introduce a new helper,
memset_after() that takes the target struct instance, the byte to write,
and the member name after which the zeroing should start.

Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Francis Laniel <laniel_francis@privacyrequired.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: Daniel Axtens <dja@axtens.net>
Cc: netdev@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/string.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/string.h b/include/linux/string.h
index ac1c769a5a80..da490c2154a9 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -271,6 +271,23 @@ static inline void memcpy_and_pad(void *dest, size_t dest_len,
 		memcpy(dest, src, dest_len);
 }
 
+/**
+ * memset_after - Set a value after a struct member to the end of a struct
+ *
+ * @obj: Address of target struct instance
+ * @v: Byte value to repeatedly write
+ * @member: after which struct member to start writing bytes
+ *
+ * This is good for clearing padding following the given member.
+ */
+#define memset_after(obj, v, member)					\
+({									\
+	u8 *__ptr = (u8 *)(obj);					\
+	typeof(v) __val = (v);						\
+	memset(__ptr + offsetofend(typeof(*(obj)), member), __val,	\
+	       sizeof(*(obj)) - offsetofend(typeof(*(obj)), member));	\
+})
+
 /**
  * str_has_prefix - Test if a string has a given prefix
  * @str: The string to test
-- 
cgit v1.2.3


From 6dbefad40815a61aecbcf9b552e87ef57ab8cc7d Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 17 May 2021 20:16:57 -0700
Subject: string.h: Introduce memset_startat() for wiping trailing members and
 padding

A common idiom in kernel code is to wipe the contents of a structure
starting from a given member. These open-coded cases are usually difficult
to read and very sensitive to struct layout changes. Like memset_after(),
introduce a new helper, memset_startat() that takes the target struct
instance, the byte to write, and the member name where zeroing should
start.

Note that this doesn't zero padding preceding the target member. For
those cases, memset_after() should be used on the preceding member.

Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Francis Laniel <laniel_francis@privacyrequired.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: Daniel Axtens <dja@axtens.net>
Cc: netdev@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/string.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/string.h b/include/linux/string.h
index da490c2154a9..5a36608144a9 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -288,6 +288,24 @@ static inline void memcpy_and_pad(void *dest, size_t dest_len,
 	       sizeof(*(obj)) - offsetofend(typeof(*(obj)), member));	\
 })
 
+/**
+ * memset_startat - Set a value starting at a member to the end of a struct
+ *
+ * @obj: Address of target struct instance
+ * @v: Byte value to repeatedly write
+ * @member: struct member to start writing at
+ *
+ * Note that if there is padding between the prior member and the target
+ * member, memset_after() should be used to clear the prior padding.
+ */
+#define memset_startat(obj, v, member)					\
+({									\
+	u8 *__ptr = (u8 *)(obj);					\
+	typeof(v) __val = (v);						\
+	memset(__ptr + offsetof(typeof(*(obj)), member), __val,		\
+	       sizeof(*(obj)) - offsetof(typeof(*(obj)), member));	\
+})
+
 /**
  * str_has_prefix - Test if a string has a given prefix
  * @str: The string to test
-- 
cgit v1.2.3


From 3080ea5553cc909b000d1f1d964a9041962f2c5b Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 9 Aug 2021 11:21:23 -0700
Subject: stddef: Introduce DECLARE_FLEX_ARRAY() helper

There are many places where kernel code wants to have several different
typed trailing flexible arrays. This would normally be done with multiple
flexible arrays in a union, but since GCC and Clang don't (on the surface)
allow this, there have been many open-coded workarounds, usually involving
neighboring 0-element arrays at the end of a structure. For example,
instead of something like this:

struct thing {
	...
	union {
		struct type1 foo[];
		struct type2 bar[];
	};
};

code works around the compiler with:

struct thing {
	...
	struct type1 foo[0];
	struct type2 bar[];
};

Another case is when a flexible array is wanted as the single member
within a struct (which itself is usually in a union). For example, this
would be worked around as:

union many {
	...
	struct {
		struct type3 baz[0];
	};
};

These kinds of work-arounds cause problems with size checks against such
zero-element arrays (for example when building with -Warray-bounds and
-Wzero-length-bounds, and with the coming FORTIFY_SOURCE improvements),
so they must all be converted to "real" flexible arrays, avoiding warnings
like this:

fs/hpfs/anode.c: In function 'hpfs_add_sector_to_btree':
fs/hpfs/anode.c:209:27: warning: array subscript 0 is outside the bounds of an interior zero-length array 'struct bplus_internal_node[0]' [-Wzero-length-bounds]
  209 |    anode->btree.u.internal[0].down = cpu_to_le32(a);
      |    ~~~~~~~~~~~~~~~~~~~~~~~^~~
In file included from fs/hpfs/hpfs_fn.h:26,
                 from fs/hpfs/anode.c:10:
fs/hpfs/hpfs.h:412:32: note: while referencing 'internal'
  412 |     struct bplus_internal_node internal[0]; /* (internal) 2-word entries giving
      |                                ^~~~~~~~

drivers/net/can/usb/etas_es58x/es58x_fd.c: In function 'es58x_fd_tx_can_msg':
drivers/net/can/usb/etas_es58x/es58x_fd.c:360:35: warning: array subscript 65535 is outside the bounds of an interior zero-length array 'u8[0]' {aka 'unsigned char[]'} [-Wzero-length-bounds]
  360 |  tx_can_msg = (typeof(tx_can_msg))&es58x_fd_urb_cmd->raw_msg[msg_len];
      |                                   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In file included from drivers/net/can/usb/etas_es58x/es58x_core.h:22,
                 from drivers/net/can/usb/etas_es58x/es58x_fd.c:17:
drivers/net/can/usb/etas_es58x/es58x_fd.h:231:6: note: while referencing 'raw_msg'
  231 |   u8 raw_msg[0];
      |      ^~~~~~~

However, it _is_ entirely possible to have one or more flexible arrays
in a struct or union: it just has to be in another struct. And since it
cannot be alone in a struct, such a struct must have at least 1 other
named member -- but that member can be zero sized. Wrap all this nonsense
into the new DECLARE_FLEX_ARRAY() in support of having flexible arrays
in unions (or alone in a struct).

As with struct_group(), since this is needed in UAPI headers as well,
implement the core there, with a non-UAPI wrapper.

Additionally update kernel-doc to understand its existence.

https://github.com/KSPP/linux/issues/137

Cc: Arnd Bergmann <arnd@arndb.de>
Cc: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/stddef.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/stddef.h b/include/linux/stddef.h
index 8b103a53b000..ca507bd5f808 100644
--- a/include/linux/stddef.h
+++ b/include/linux/stddef.h
@@ -84,4 +84,17 @@ enum {
 #define struct_group_tagged(TAG, NAME, MEMBERS...) \
 	__struct_group(TAG, NAME, /* no attrs */, MEMBERS)
 
+/**
+ * DECLARE_FLEX_ARRAY() - Declare a flexible array usable in a union
+ *
+ * @TYPE: The type of each flexible array element
+ * @NAME: The name of the flexible array member
+ *
+ * In order to have a flexible array member in a union or alone in a
+ * struct, it needs to be wrapped in an anonymous struct with at least 1
+ * named member, but that member can be empty.
+ */
+#define DECLARE_FLEX_ARRAY(TYPE, NAME) \
+	__DECLARE_FLEX_ARRAY(TYPE, NAME)
+
 #endif
-- 
cgit v1.2.3


From fa7845cfd53f3b1d3f60efa55db89805595bc045 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 9 Aug 2021 11:29:33 -0700
Subject: treewide: Replace open-coded flex arrays in unions

In support of enabling -Warray-bounds and -Wzero-length-bounds and
correctly handling run-time memcpy() bounds checking, replace all
open-coded flexible arrays (i.e. 0-element arrays) in unions with the
DECLARE_FLEX_ARRAY() helper macro.

This fixes warnings such as:

fs/hpfs/anode.c: In function 'hpfs_add_sector_to_btree':
fs/hpfs/anode.c:209:27: warning: array subscript 0 is outside the bounds of an interior zero-length array 'struct bplus_internal_node[0]' [-Wzero-length-bounds]
  209 |    anode->btree.u.internal[0].down = cpu_to_le32(a);
      |    ~~~~~~~~~~~~~~~~~~~~~~~^~~
In file included from fs/hpfs/hpfs_fn.h:26,
                 from fs/hpfs/anode.c:10:
fs/hpfs/hpfs.h:412:32: note: while referencing 'internal'
  412 |     struct bplus_internal_node internal[0]; /* (internal) 2-word entries giving
      |                                ^~~~~~~~

drivers/net/can/usb/etas_es58x/es58x_fd.c: In function 'es58x_fd_tx_can_msg':
drivers/net/can/usb/etas_es58x/es58x_fd.c:360:35: warning: array subscript 65535 is outside the bounds of an interior zero-length array 'u8[0]' {aka 'unsigned char[]'} [-Wzero-length-bounds]
  360 |  tx_can_msg = (typeof(tx_can_msg))&es58x_fd_urb_cmd->raw_msg[msg_len];
      |                                   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In file included from drivers/net/can/usb/etas_es58x/es58x_core.h:22,
                 from drivers/net/can/usb/etas_es58x/es58x_fd.c:17:
drivers/net/can/usb/etas_es58x/es58x_fd.h:231:6: note: while referencing 'raw_msg'
  231 |   u8 raw_msg[0];
      |      ^~~~~~~

Cc: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Ayush Sawal <ayush.sawal@chelsio.com>
Cc: Vinay Kumar Yadav <vinay.yadav@chelsio.com>
Cc: Rohit Maheshwari <rohitm@chelsio.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Kalle Valo <kvalo@codeaurora.org>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Stanislaw Gruszka <stf_xl@wp.pl>
Cc: Luca Coelho <luciano.coelho@intel.com>
Cc: "James E.J. Bottomley" <jejb@linux.ibm.com>
Cc: "Martin K. Petersen" <martin.petersen@oracle.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Martin KaFai Lau <kafai@fb.com>
Cc: Song Liu <songliubraving@fb.com>
Cc: Yonghong Song <yhs@fb.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: KP Singh <kpsingh@kernel.org>
Cc: Johannes Berg <johannes.berg@intel.com>
Cc: Mordechay Goodstein <mordechay.goodstein@intel.com>
Cc: Lee Jones <lee.jones@linaro.org>
Cc: Wolfgang Grandegger <wg@grandegger.com>
Cc: Marc Kleine-Budde <mkl@pengutronix.de>
Cc: Arunachalam Santhanam <arunachalam.santhanam@in.bosch.com>
Cc: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Cc: Mikulas Patocka <mikulas@artax.karlin.mff.cuni.cz>
Cc: linux-crypto@vger.kernel.org
Cc: ath10k@lists.infradead.org
Cc: linux-wireless@vger.kernel.org
Cc: netdev@vger.kernel.org
Cc: linux-scsi@vger.kernel.org
Cc: linux-can@vger.kernel.org
Cc: bpf@vger.kernel.org
Acked-by: Marc Kleine-Budde <mkl@pengutronix.de> # drivers/net/can/usb/etas_es58x/*
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/filter.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 4a93c12543ee..4298c5e428a3 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -586,8 +586,10 @@ struct bpf_prog {
 	struct bpf_prog_aux	*aux;		/* Auxiliary fields */
 	struct sock_fprog_kern	*orig_prog;	/* Original BPF program */
 	/* Instructions for interpreter */
-	struct sock_filter	insns[0];
-	struct bpf_insn		insnsi[];
+	union {
+		DECLARE_FLEX_ARRAY(struct sock_filter, insns);
+		DECLARE_FLEX_ARRAY(struct bpf_insn, insnsi);
+	};
 };
 
 struct sk_filter {
-- 
cgit v1.2.3


From 47c662486cccf03e7062139d069b07ab0126ef59 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 13 Aug 2021 12:19:24 -0700
Subject: treewide: Replace 0-element memcpy() destinations with flexible
 arrays

The 0-element arrays that are used as memcpy() destinations are actually
flexible arrays. Adjust their structures accordingly so that memcpy()
can better reason able their destination size (i.e. they need to be seen
as "unknown" length rather than "zero").

In some cases, use of the DECLARE_FLEX_ARRAY() helper is needed when a
flexible array is alone in a struct.

Cc: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Kalle Valo <kvalo@codeaurora.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Nilesh Javali <njavali@marvell.com>
Cc: Manish Rangankar <mrangankar@marvell.com>
Cc: GR-QLogic-Storage-Upstream@marvell.com
Cc: "James E.J. Bottomley" <jejb@linux.ibm.com>
Cc: "Martin K. Petersen" <martin.petersen@oracle.com>
Cc: Larry Finger <Larry.Finger@lwfinger.net>
Cc: Phillip Potter <phil@philpotter.co.uk>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Florian Schilhabel <florian.c.schilhabel@googlemail.com>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Cc: Fabio Aiuto <fabioaiuto83@gmail.com>
Cc: Ross Schmidt <ross.schm.dev@gmail.com>
Cc: Marco Cesati <marcocesati@gmail.com>
Cc: ath10k@lists.infradead.org
Cc: linux-wireless@vger.kernel.org
Cc: netdev@vger.kernel.org
Cc: linux-scsi@vger.kernel.org
Cc: linux-staging@lists.linux.dev
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/ieee80211.h | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 694264503119..ada3dd79cd08 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1143,7 +1143,7 @@ struct ieee80211_mgmt {
 			__le16 auth_transaction;
 			__le16 status_code;
 			/* possibly followed by Challenge text */
-			u8 variable[0];
+			u8 variable[];
 		} __packed auth;
 		struct {
 			__le16 reason_code;
@@ -1152,26 +1152,26 @@ struct ieee80211_mgmt {
 			__le16 capab_info;
 			__le16 listen_interval;
 			/* followed by SSID and Supported rates */
-			u8 variable[0];
+			u8 variable[];
 		} __packed assoc_req;
 		struct {
 			__le16 capab_info;
 			__le16 status_code;
 			__le16 aid;
 			/* followed by Supported rates */
-			u8 variable[0];
+			u8 variable[];
 		} __packed assoc_resp, reassoc_resp;
 		struct {
 			__le16 capab_info;
 			__le16 status_code;
-			u8 variable[0];
+			u8 variable[];
 		} __packed s1g_assoc_resp, s1g_reassoc_resp;
 		struct {
 			__le16 capab_info;
 			__le16 listen_interval;
 			u8 current_ap[ETH_ALEN];
 			/* followed by SSID and Supported rates */
-			u8 variable[0];
+			u8 variable[];
 		} __packed reassoc_req;
 		struct {
 			__le16 reason_code;
@@ -1182,11 +1182,11 @@ struct ieee80211_mgmt {
 			__le16 capab_info;
 			/* followed by some of SSID, Supported rates,
 			 * FH Params, DS Params, CF Params, IBSS Params, TIM */
-			u8 variable[0];
+			u8 variable[];
 		} __packed beacon;
 		struct {
 			/* only variable items: SSID, Supported rates */
-			u8 variable[0];
+			DECLARE_FLEX_ARRAY(u8, variable);
 		} __packed probe_req;
 		struct {
 			__le64 timestamp;
@@ -1194,7 +1194,7 @@ struct ieee80211_mgmt {
 			__le16 capab_info;
 			/* followed by some of SSID, Supported rates,
 			 * FH Params, DS Params, CF Params, IBSS Params */
-			u8 variable[0];
+			u8 variable[];
 		} __packed probe_resp;
 		struct {
 			u8 category;
@@ -1203,16 +1203,16 @@ struct ieee80211_mgmt {
 					u8 action_code;
 					u8 dialog_token;
 					u8 status_code;
-					u8 variable[0];
+					u8 variable[];
 				} __packed wme_action;
 				struct{
 					u8 action_code;
-					u8 variable[0];
+					u8 variable[];
 				} __packed chan_switch;
 				struct{
 					u8 action_code;
 					struct ieee80211_ext_chansw_ie data;
-					u8 variable[0];
+					u8 variable[];
 				} __packed ext_chan_switch;
 				struct{
 					u8 action_code;
@@ -1228,7 +1228,7 @@ struct ieee80211_mgmt {
 					__le16 timeout;
 					__le16 start_seq_num;
 					/* followed by BA Extension */
-					u8 variable[0];
+					u8 variable[];
 				} __packed addba_req;
 				struct{
 					u8 action_code;
@@ -1244,11 +1244,11 @@ struct ieee80211_mgmt {
 				} __packed delba;
 				struct {
 					u8 action_code;
-					u8 variable[0];
+					u8 variable[];
 				} __packed self_prot;
 				struct{
 					u8 action_code;
-					u8 variable[0];
+					u8 variable[];
 				} __packed mesh_action;
 				struct {
 					u8 action;
@@ -1292,7 +1292,7 @@ struct ieee80211_mgmt {
 					u8 toa[6];
 					__le16 tod_error;
 					__le16 toa_error;
-					u8 variable[0];
+					u8 variable[];
 				} __packed ftm;
 				struct {
 					u8 action_code;
-- 
cgit v1.2.3


From afd7de03c5268f74202c1dd4780a8532a11f4c6b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 18 Oct 2021 08:53:19 -0600
Subject: block: remove some blk_mq_hw_ctx debugfs entries

Just like the blk_mq_ctx counterparts, we've got a bunch of counters
in here that are only for debugfs and are of questionnable value. They
are:

- dispatched, index of how many requests were dispatched in one go

- poll_{considered,invoked,success}, which track poll sucess rates. We're
  confident in the iopoll implementation at this point, don't bother
  tracking these.

As a bonus, this shrinks each hardware queue from 576 bytes to 512 bytes,
dropping a whole cacheline.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 95c3bd3a008e..9fb8618fb957 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -341,9 +341,6 @@ struct blk_mq_hw_ctx {
 	unsigned long		queued;
 	/** @run: Number of dispatched requests. */
 	unsigned long		run;
-#define BLK_MQ_MAX_DISPATCH_ORDER	7
-	/** @dispatched: Number of dispatch requests by queue. */
-	unsigned long		dispatched[BLK_MQ_MAX_DISPATCH_ORDER];
 
 	/** @numa_node: NUMA node the storage adapter has been connected to. */
 	unsigned int		numa_node;
@@ -363,13 +360,6 @@ struct blk_mq_hw_ctx {
 	/** @kobj: Kernel object for sysfs. */
 	struct kobject		kobj;
 
-	/** @poll_considered: Count times blk_mq_poll() was called. */
-	unsigned long		poll_considered;
-	/** @poll_invoked: Count how many requests blk_mq_poll() polled. */
-	unsigned long		poll_invoked;
-	/** @poll_success: Count how many polled requests were completed. */
-	unsigned long		poll_success;
-
 #ifdef CONFIG_BLK_DEBUG_FS
 	/**
 	 * @debugfs_dir: debugfs directory for this hardware queue. Named
-- 
cgit v1.2.3


From 013a7f95438144f4ab39a1017a0bff2765d2551a Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 13 Oct 2021 07:58:52 -0600
Subject: block: provide helpers for rq_list manipulation

Instead of open-coding the list additions, traversal, and removal,
provide a basic set of helpers.

Suggested-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d5b21fc8f49e..b0a322172965 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1298,4 +1298,33 @@ int fsync_bdev(struct block_device *bdev);
 int freeze_bdev(struct block_device *bdev);
 int thaw_bdev(struct block_device *bdev);
 
+#define rq_list_add(listptr, rq)	do {		\
+	(rq)->rq_next = *(listptr);			\
+	*(listptr) = rq;				\
+} while (0)
+
+#define rq_list_pop(listptr)				\
+({							\
+	struct request *__req = NULL;			\
+	if ((listptr) && *(listptr))	{		\
+		__req = *(listptr);			\
+		*(listptr) = __req->rq_next;		\
+	}						\
+	__req;						\
+})
+
+#define rq_list_peek(listptr)				\
+({							\
+	struct request *__req = NULL;			\
+	if ((listptr) && *(listptr))			\
+		__req = *(listptr);			\
+	__req;						\
+})
+
+#define rq_list_for_each(listptr, pos)			\
+	for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos)) \
+
+#define rq_list_next(rq)	(rq)->rq_next
+#define rq_list_empty(list)	((list) == (struct request *) NULL)
+
 #endif /* _LINUX_BLKDEV_H */
-- 
cgit v1.2.3


From 5a72e899ceb465d731c413d57c6c12cdbf88303c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 12 Oct 2021 09:24:29 -0600
Subject: block: add a struct io_comp_batch argument to fops->iopoll()

struct io_comp_batch contains a list head and a completion handler, which
will allow completions to more effciently completed batches of IO.

For now, no functional changes in this patch, we just define the
io_comp_batch structure and add the argument to the file_operations iopoll
handler.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h |  2 +-
 include/linux/blkdev.h | 13 +++++++++++--
 include/linux/fs.h     |  4 +++-
 3 files changed, 15 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 9fb8618fb957..4c79439af2f2 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -532,7 +532,7 @@ struct blk_mq_ops {
 	/**
 	 * @poll: Called to poll for completion of a specific tag.
 	 */
-	int (*poll)(struct blk_mq_hw_ctx *);
+	int (*poll)(struct blk_mq_hw_ctx *, struct io_comp_batch *);
 
 	/**
 	 * @complete: Mark the request as complete.
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b0a322172965..fd9771a1da09 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -569,8 +569,9 @@ blk_status_t errno_to_blk_status(int errno);
 #define BLK_POLL_ONESHOT		(1 << 0)
 /* do not sleep to wait for the expected completion time */
 #define BLK_POLL_NOSLEEP		(1 << 1)
-int bio_poll(struct bio *bio, unsigned int flags);
-int iocb_bio_iopoll(struct kiocb *kiocb, unsigned int flags);
+int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags);
+int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob,
+			unsigned int flags);
 
 static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
 {
@@ -1298,6 +1299,14 @@ int fsync_bdev(struct block_device *bdev);
 int freeze_bdev(struct block_device *bdev);
 int thaw_bdev(struct block_device *bdev);
 
+struct io_comp_batch {
+	struct request *req_list;
+	bool need_ts;
+	void (*complete)(struct io_comp_batch *);
+};
+
+#define DEFINE_IO_COMP_BATCH(name)	struct io_comp_batch name = { }
+
 #define rq_list_add(listptr, rq)	do {		\
 	(rq)->rq_next = *(listptr);			\
 	*(listptr) = rq;				\
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f595f4097cb7..31029a91f440 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -48,6 +48,7 @@
 struct backing_dev_info;
 struct bdi_writeback;
 struct bio;
+struct io_comp_batch;
 struct export_operations;
 struct fiemap_extent_info;
 struct hd_geometry;
@@ -2071,7 +2072,8 @@ struct file_operations {
 	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
 	ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
 	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
-	int (*iopoll)(struct kiocb *kiocb, unsigned int flags);
+	int (*iopoll)(struct kiocb *kiocb, struct io_comp_batch *,
+			unsigned int flags);
 	int (*iterate) (struct file *, struct dir_context *);
 	int (*iterate_shared) (struct file *, struct dir_context *);
 	__poll_t (*poll) (struct file *, struct poll_table_struct *);
-- 
cgit v1.2.3


From 1aec5e4a2962f7e0b3fb3e7308dd726be2472c26 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 8 Oct 2021 05:44:23 -0600
Subject: sbitmap: add helper to clear a batch of tags

sbitmap currently only supports clearing tags one-by-one, add a helper
that allows the caller to pass in an array of tags to clear.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/sbitmap.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index e30b56023ead..4a6ff274335a 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -528,6 +528,17 @@ void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq,
 void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
 			 unsigned int cpu);
 
+/**
+ * sbitmap_queue_clear_batch() - Free a batch of allocated bits
+ * &struct sbitmap_queue.
+ * @sbq: Bitmap to free from.
+ * @offset: offset for each tag in array
+ * @tags: array of tags
+ * @nr_tags: number of tags in array
+ */
+void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset,
+				int *tags, int nr_tags);
+
 static inline int sbq_index_inc(int index)
 {
 	return (index + 1) & (SBQ_WAIT_QUEUES - 1);
-- 
cgit v1.2.3


From f794f3351f2672d782b8df0fa59f3cef38cffa59 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 8 Oct 2021 05:50:46 -0600
Subject: block: add support for blk_mq_end_request_batch()

Instead of calling blk_mq_end_request() on a single request, add a helper
that takes the new struct io_comp_batch and completes any request stored
in there.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 4c79439af2f2..656fe34bdb6c 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -728,6 +728,35 @@ static inline void blk_mq_set_request_complete(struct request *rq)
 void blk_mq_start_request(struct request *rq);
 void blk_mq_end_request(struct request *rq, blk_status_t error);
 void __blk_mq_end_request(struct request *rq, blk_status_t error);
+void blk_mq_end_request_batch(struct io_comp_batch *ib);
+
+/*
+ * Only need start/end time stamping if we have iostat or
+ * blk stats enabled, or using an IO scheduler.
+ */
+static inline bool blk_mq_need_time_stamp(struct request *rq)
+{
+	return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_ELV));
+}
+
+/*
+ * Batched completions only work when there is no I/O error and no special
+ * ->end_io handler.
+ */
+static inline bool blk_mq_add_to_batch(struct request *req,
+				       struct io_comp_batch *iob, int ioerror,
+				       void (*complete)(struct io_comp_batch *))
+{
+	if (!iob || (req->rq_flags & RQF_ELV) || req->end_io || ioerror)
+		return false;
+	if (!iob->complete)
+		iob->complete = complete;
+	else if (iob->complete != complete)
+		return false;
+	iob->need_ts |= blk_mq_need_time_stamp(req);
+	rq_list_add(&iob->req_list, req);
+	return true;
+}
 
 void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
 void blk_mq_kick_requeue_list(struct request_queue *q);
-- 
cgit v1.2.3


From 99457db8b40c66941072a383a5ab4e36bc53fd3d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 18 Oct 2021 12:11:01 +0200
Subject: block: move the SECTOR_SIZE related definitions to blk_types.h

Ensure these are always available for inlines in the various block layer
headers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Link: https://lore.kernel.org/r/20211018101130.1838532-2-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk_types.h | 17 +++++++++++++++++
 include/linux/blkdev.h    | 17 -----------------
 2 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 1e370929c89e..472e55e0e94f 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -20,6 +20,23 @@ struct cgroup_subsys_state;
 typedef void (bio_end_io_t) (struct bio *);
 struct bio_crypt_ctx;
 
+/*
+ * The basic unit of block I/O is a sector. It is used in a number of contexts
+ * in Linux (blk, bio, genhd). The size of one sector is 512 = 2**9
+ * bytes. Variables of type sector_t represent an offset or size that is a
+ * multiple of 512 bytes. Hence these two constants.
+ */
+#ifndef SECTOR_SHIFT
+#define SECTOR_SHIFT 9
+#endif
+#ifndef SECTOR_SIZE
+#define SECTOR_SIZE (1 << SECTOR_SHIFT)
+#endif
+
+#define PAGE_SECTORS_SHIFT	(PAGE_SHIFT - SECTOR_SHIFT)
+#define PAGE_SECTORS		(1 << PAGE_SECTORS_SHIFT)
+#define SECTOR_MASK		(PAGE_SECTORS - 1)
+
 struct block_device {
 	sector_t		bd_start_sect;
 	struct disk_stats __percpu *bd_stats;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index fd9771a1da09..abe721591e80 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -578,23 +578,6 @@ static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
 	return bdev->bd_queue;	/* this is never NULL */
 }
 
-/*
- * The basic unit of block I/O is a sector. It is used in a number of contexts
- * in Linux (blk, bio, genhd). The size of one sector is 512 = 2**9
- * bytes. Variables of type sector_t represent an offset or size that is a
- * multiple of 512 bytes. Hence these two constants.
- */
-#ifndef SECTOR_SHIFT
-#define SECTOR_SHIFT 9
-#endif
-#ifndef SECTOR_SIZE
-#define SECTOR_SIZE (1 << SECTOR_SHIFT)
-#endif
-
-#define PAGE_SECTORS_SHIFT	(PAGE_SHIFT - SECTOR_SHIFT)
-#define PAGE_SECTORS		(1 << PAGE_SECTORS_SHIFT)
-#define SECTOR_MASK		(PAGE_SECTORS - 1)
-
 #ifdef CONFIG_BLK_DEV_ZONED
 
 /* Helper to convert BLK_ZONE_ZONE_XXX to its string format XXX */
-- 
cgit v1.2.3


From 6436bd90f76e75d2c5786a50203b05a9b7f7100d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 18 Oct 2021 12:11:02 +0200
Subject: block: add a bdev_nr_bytes helper

Add a helper to query the size of a block device in bytes.  This
will be used to remove open coded access to ->bd_inode.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20211018101130.1838532-3-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/genhd.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index cd4038fd5743..01d27f3a970e 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -236,9 +236,14 @@ static inline sector_t get_start_sect(struct block_device *bdev)
 	return bdev->bd_start_sect;
 }
 
+static inline loff_t bdev_nr_bytes(struct block_device *bdev)
+{
+	return i_size_read(bdev->bd_inode);
+}
+
 static inline sector_t bdev_nr_sectors(struct block_device *bdev)
 {
-	return i_size_read(bdev->bd_inode) >> 9;
+	return bdev_nr_bytes(bdev) >> SECTOR_SHIFT;
 }
 
 static inline sector_t get_capacity(struct gendisk *disk)
-- 
cgit v1.2.3


From bcc6e2cfaa48a4ad2e17719194f6d97a8e03f6c1 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 18 Oct 2021 12:11:25 +0200
Subject: block: add a sb_bdev_nr_blocks helper

Add a helper to return the size of sb->s_bdev in sb->s_blocksize_bits
based unites.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Link: https://lore.kernel.org/r/20211018101130.1838532-26-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/genhd.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 01d27f3a970e..7b0326661a1e 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -251,6 +251,12 @@ static inline sector_t get_capacity(struct gendisk *disk)
 	return bdev_nr_sectors(disk->part0);
 }
 
+static inline u64 sb_bdev_nr_blocks(struct super_block *sb)
+{
+	return bdev_nr_sectors(sb->s_bdev) >>
+		(sb->s_blocksize_bits - SECTOR_SHIFT);
+}
+
 int bdev_disk_changed(struct gendisk *disk, bool invalidate);
 void blk_drop_partitions(struct gendisk *disk);
 
-- 
cgit v1.2.3


From f09313c57a17683cbcb305989daf1d94b49fd32c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 18 Oct 2021 11:39:45 -0600
Subject: block: cache inode size in bdev

Reading the inode size brings in a new cacheline for IO submit, and
it's in the hot path being checked for every single IO. When doing
millions of IOs per core per second, this is noticeable overhead.

Cache the nr_sectors in the bdev itself.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk_types.h | 1 +
 include/linux/genhd.h     | 8 ++++----
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 472e55e0e94f..fe065c394fff 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -39,6 +39,7 @@ struct bio_crypt_ctx;
 
 struct block_device {
 	sector_t		bd_start_sect;
+	sector_t		bd_nr_sectors;
 	struct disk_stats __percpu *bd_stats;
 	unsigned long		bd_stamp;
 	bool			bd_read_only;	/* read-only policy */
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 7b0326661a1e..900325aa5d31 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -236,14 +236,14 @@ static inline sector_t get_start_sect(struct block_device *bdev)
 	return bdev->bd_start_sect;
 }
 
-static inline loff_t bdev_nr_bytes(struct block_device *bdev)
+static inline sector_t bdev_nr_sectors(struct block_device *bdev)
 {
-	return i_size_read(bdev->bd_inode);
+	return bdev->bd_nr_sectors;
 }
 
-static inline sector_t bdev_nr_sectors(struct block_device *bdev)
+static inline loff_t bdev_nr_bytes(struct block_device *bdev)
 {
-	return bdev_nr_bytes(bdev) >> SECTOR_SHIFT;
+	return bdev_nr_sectors(bdev) << SECTOR_SHIFT;
 }
 
 static inline sector_t get_capacity(struct gendisk *disk)
-- 
cgit v1.2.3


From 425a563acb1d1872c0036fbcb644247640edbbc6 Mon Sep 17 00:00:00 2001
From: Maor Gottlieb <maorg@nvidia.com>
Date: Sun, 23 May 2021 13:34:28 +0300
Subject: net/mlx5: Introduce port selection namespace

Add new port selection flow steering namespace. Flow steering rules in
this namespaceare are used to determine the physical port for egress
packets.

Signed-off-by: Maor Gottlieb <maorg@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/device.h   | 15 +++++++++++++++
 include/linux/mlx5/fs.h       |  1 +
 include/linux/mlx5/mlx5_ifc.h | 25 +++++++++++++++++++++++--
 3 files changed, 39 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 109cc8106d16..347167c18802 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1185,6 +1185,7 @@ enum mlx5_cap_type {
 	MLX5_CAP_DEV_EVENT = 0x14,
 	MLX5_CAP_IPSEC,
 	MLX5_CAP_GENERAL_2 = 0x20,
+	MLX5_CAP_PORT_SELECTION = 0x25,
 	/* NUM OF CAP Types */
 	MLX5_CAP_NUM
 };
@@ -1342,6 +1343,20 @@ enum mlx5_qcam_feature_groups {
 	MLX5_GET(e_switch_cap, \
 		 mdev->caps.hca[MLX5_CAP_ESWITCH]->max, cap)
 
+#define MLX5_CAP_PORT_SELECTION(mdev, cap) \
+	MLX5_GET(port_selection_cap, \
+		 mdev->caps.hca[MLX5_CAP_PORT_SELECTION]->cur, cap)
+
+#define MLX5_CAP_PORT_SELECTION_MAX(mdev, cap) \
+	MLX5_GET(port_selection_cap, \
+		 mdev->caps.hca[MLX5_CAP_PORT_SELECTION]->max, cap)
+
+#define MLX5_CAP_FLOWTABLE_PORT_SELECTION(mdev, cap) \
+	MLX5_CAP_PORT_SELECTION(mdev, flow_table_properties_port_selection.cap)
+
+#define MLX5_CAP_FLOWTABLE_PORT_SELECTION_MAX(mdev, cap) \
+	MLX5_CAP_PORT_SELECTION_MAX(mdev, flow_table_properties_port_selection.cap)
+
 #define MLX5_CAP_ODP(mdev, cap)\
 	MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP]->cur, cap)
 
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 0106c67e8ccb..259fcc168340 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -83,6 +83,7 @@ enum mlx5_flow_namespace_type {
 	MLX5_FLOW_NAMESPACE_RDMA_RX,
 	MLX5_FLOW_NAMESPACE_RDMA_RX_KERNEL,
 	MLX5_FLOW_NAMESPACE_RDMA_TX,
+	MLX5_FLOW_NAMESPACE_PORT_SEL,
 };
 
 enum {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index c614ad1da44d..db1d9c69c1fa 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -767,6 +767,18 @@ struct mlx5_ifc_flow_table_nic_cap_bits {
 	u8         reserved_at_20c0[0x5f40];
 };
 
+struct mlx5_ifc_port_selection_cap_bits {
+	u8         reserved_at_0[0x10];
+	u8         port_select_flow_table[0x1];
+	u8         reserved_at_11[0xf];
+
+	u8         reserved_at_20[0x1e0];
+
+	struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_port_selection;
+
+	u8         reserved_at_400[0x7c00];
+};
+
 enum {
 	MLX5_FDB_TO_VPORT_REG_C_0 = 0x01,
 	MLX5_FDB_TO_VPORT_REG_C_1 = 0x02,
@@ -1515,7 +1527,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         uar_4k[0x1];
 	u8         reserved_at_241[0x9];
 	u8         uar_sz[0x6];
-	u8         reserved_at_248[0x2];
+	u8         port_selection_cap[0x1];
+	u8         reserved_at_248[0x1];
 	u8         umem_uid_0[0x1];
 	u8         reserved_at_250[0x5];
 	u8         log_pg_sz[0x8];
@@ -3164,6 +3177,7 @@ union mlx5_ifc_hca_cap_union_bits {
 	struct mlx5_ifc_flow_table_nic_cap_bits flow_table_nic_cap;
 	struct mlx5_ifc_flow_table_eswitch_cap_bits flow_table_eswitch_cap;
 	struct mlx5_ifc_e_switch_cap_bits e_switch_cap;
+	struct mlx5_ifc_port_selection_cap_bits port_selection_cap;
 	struct mlx5_ifc_vector_calc_cap_bits vector_calc_cap;
 	struct mlx5_ifc_qos_cap_bits qos_cap;
 	struct mlx5_ifc_debug_cap_bits debug_cap;
@@ -10434,9 +10448,16 @@ struct mlx5_ifc_dcbx_param_bits {
 	u8         reserved_at_a0[0x160];
 };
 
+enum {
+	MLX5_LAG_PORT_SELECT_MODE_QUEUE_AFFINITY = 0,
+	MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_FT,
+};
+
 struct mlx5_ifc_lagc_bits {
 	u8         fdb_selection_mode[0x1];
-	u8         reserved_at_1[0x1c];
+	u8         reserved_at_1[0x14];
+	u8         port_select_mode[0x3];
+	u8         reserved_at_18[0x5];
 	u8         lag_state[0x3];
 
 	u8         reserved_at_20[0x14];
-- 
cgit v1.2.3


From e7e2519e3632396a25031b7e828ed35332e5dd07 Mon Sep 17 00:00:00 2001
From: Maor Gottlieb <maorg@nvidia.com>
Date: Tue, 6 Jul 2021 17:48:26 +0300
Subject: net/mlx5: Add support to create match definer

Introduce new APIs to create and destroy flow matcher
for given format id.

Flow match definer object is used for defining the fields and
mask used for the hash calculation. User should mask the desired
fields like done in the match criteria.

This object is assigned to flow group of type hash. In this flow
group type, packets lookup is done based on the hash result.

This patch also adds the required bits to create such flow group.

Signed-off-by: Maor Gottlieb <maorg@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/fs.h       |   8 ++
 include/linux/mlx5/mlx5_ifc.h | 272 +++++++++++++++++++++++++++++++++++++-----
 2 files changed, 253 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 259fcc168340..7a43fec63a35 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -98,6 +98,7 @@ enum {
 
 struct mlx5_pkt_reformat;
 struct mlx5_modify_hdr;
+struct mlx5_flow_definer;
 struct mlx5_flow_table;
 struct mlx5_flow_group;
 struct mlx5_flow_namespace;
@@ -258,6 +259,13 @@ struct mlx5_modify_hdr *mlx5_modify_header_alloc(struct mlx5_core_dev *dev,
 						 void *modify_actions);
 void mlx5_modify_header_dealloc(struct mlx5_core_dev *dev,
 				struct mlx5_modify_hdr *modify_hdr);
+struct mlx5_flow_definer *
+mlx5_create_match_definer(struct mlx5_core_dev *dev,
+			  enum mlx5_flow_namespace_type ns_type, u16 format_id,
+			  u32 *match_mask);
+void mlx5_destroy_match_definer(struct mlx5_core_dev *dev,
+				struct mlx5_flow_definer *definer);
+int mlx5_get_match_definer_id(struct mlx5_flow_definer *definer);
 
 struct mlx5_pkt_reformat_params {
 	int type;
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index db1d9c69c1fa..8f41145bc6ef 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -94,6 +94,7 @@ enum {
 enum {
 	MLX5_OBJ_TYPE_GENEVE_TLV_OPT = 0x000b,
 	MLX5_OBJ_TYPE_VIRTIO_NET_Q = 0x000d,
+	MLX5_OBJ_TYPE_MATCH_DEFINER = 0x0018,
 	MLX5_OBJ_TYPE_MKEY = 0xff01,
 	MLX5_OBJ_TYPE_QP = 0xff02,
 	MLX5_OBJ_TYPE_PSV = 0xff03,
@@ -1731,7 +1732,7 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         flex_parser_id_outer_first_mpls_over_gre[0x4];
 	u8         flex_parser_id_outer_first_mpls_over_udp_label[0x4];
 
-	u8	   reserved_at_6e0[0x10];
+	u8         max_num_match_definer[0x10];
 	u8	   sf_base_id[0x10];
 
 	u8         flex_parser_id_gtpu_dw_2[0x4];
@@ -1746,7 +1747,7 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 
 	u8	   reserved_at_760[0x20];
 	u8	   vhca_tunnel_commands[0x40];
-	u8	   reserved_at_7c0[0x40];
+	u8         match_definer_format_supported[0x40];
 };
 
 struct mlx5_ifc_cmd_hca_cap_2_bits {
@@ -5666,6 +5667,236 @@ struct mlx5_ifc_query_fte_in_bits {
 	u8         reserved_at_120[0xe0];
 };
 
+struct mlx5_ifc_match_definer_format_0_bits {
+	u8         reserved_at_0[0x100];
+
+	u8         metadata_reg_c_0[0x20];
+
+	u8         metadata_reg_c_1[0x20];
+
+	u8         outer_dmac_47_16[0x20];
+
+	u8         outer_dmac_15_0[0x10];
+	u8         outer_ethertype[0x10];
+
+	u8         reserved_at_180[0x1];
+	u8         sx_sniffer[0x1];
+	u8         functional_lb[0x1];
+	u8         outer_ip_frag[0x1];
+	u8         outer_qp_type[0x2];
+	u8         outer_encap_type[0x2];
+	u8         port_number[0x2];
+	u8         outer_l3_type[0x2];
+	u8         outer_l4_type[0x2];
+	u8         outer_first_vlan_type[0x2];
+	u8         outer_first_vlan_prio[0x3];
+	u8         outer_first_vlan_cfi[0x1];
+	u8         outer_first_vlan_vid[0xc];
+
+	u8         outer_l4_type_ext[0x4];
+	u8         reserved_at_1a4[0x2];
+	u8         outer_ipsec_layer[0x2];
+	u8         outer_l2_type[0x2];
+	u8         force_lb[0x1];
+	u8         outer_l2_ok[0x1];
+	u8         outer_l3_ok[0x1];
+	u8         outer_l4_ok[0x1];
+	u8         outer_second_vlan_type[0x2];
+	u8         outer_second_vlan_prio[0x3];
+	u8         outer_second_vlan_cfi[0x1];
+	u8         outer_second_vlan_vid[0xc];
+
+	u8         outer_smac_47_16[0x20];
+
+	u8         outer_smac_15_0[0x10];
+	u8         inner_ipv4_checksum_ok[0x1];
+	u8         inner_l4_checksum_ok[0x1];
+	u8         outer_ipv4_checksum_ok[0x1];
+	u8         outer_l4_checksum_ok[0x1];
+	u8         inner_l3_ok[0x1];
+	u8         inner_l4_ok[0x1];
+	u8         outer_l3_ok_duplicate[0x1];
+	u8         outer_l4_ok_duplicate[0x1];
+	u8         outer_tcp_cwr[0x1];
+	u8         outer_tcp_ece[0x1];
+	u8         outer_tcp_urg[0x1];
+	u8         outer_tcp_ack[0x1];
+	u8         outer_tcp_psh[0x1];
+	u8         outer_tcp_rst[0x1];
+	u8         outer_tcp_syn[0x1];
+	u8         outer_tcp_fin[0x1];
+};
+
+struct mlx5_ifc_match_definer_format_22_bits {
+	u8         reserved_at_0[0x100];
+
+	u8         outer_ip_src_addr[0x20];
+
+	u8         outer_ip_dest_addr[0x20];
+
+	u8         outer_l4_sport[0x10];
+	u8         outer_l4_dport[0x10];
+
+	u8         reserved_at_160[0x1];
+	u8         sx_sniffer[0x1];
+	u8         functional_lb[0x1];
+	u8         outer_ip_frag[0x1];
+	u8         outer_qp_type[0x2];
+	u8         outer_encap_type[0x2];
+	u8         port_number[0x2];
+	u8         outer_l3_type[0x2];
+	u8         outer_l4_type[0x2];
+	u8         outer_first_vlan_type[0x2];
+	u8         outer_first_vlan_prio[0x3];
+	u8         outer_first_vlan_cfi[0x1];
+	u8         outer_first_vlan_vid[0xc];
+
+	u8         metadata_reg_c_0[0x20];
+
+	u8         outer_dmac_47_16[0x20];
+
+	u8         outer_smac_47_16[0x20];
+
+	u8         outer_smac_15_0[0x10];
+	u8         outer_dmac_15_0[0x10];
+};
+
+struct mlx5_ifc_match_definer_format_23_bits {
+	u8         reserved_at_0[0x100];
+
+	u8         inner_ip_src_addr[0x20];
+
+	u8         inner_ip_dest_addr[0x20];
+
+	u8         inner_l4_sport[0x10];
+	u8         inner_l4_dport[0x10];
+
+	u8         reserved_at_160[0x1];
+	u8         sx_sniffer[0x1];
+	u8         functional_lb[0x1];
+	u8         inner_ip_frag[0x1];
+	u8         inner_qp_type[0x2];
+	u8         inner_encap_type[0x2];
+	u8         port_number[0x2];
+	u8         inner_l3_type[0x2];
+	u8         inner_l4_type[0x2];
+	u8         inner_first_vlan_type[0x2];
+	u8         inner_first_vlan_prio[0x3];
+	u8         inner_first_vlan_cfi[0x1];
+	u8         inner_first_vlan_vid[0xc];
+
+	u8         tunnel_header_0[0x20];
+
+	u8         inner_dmac_47_16[0x20];
+
+	u8         inner_smac_47_16[0x20];
+
+	u8         inner_smac_15_0[0x10];
+	u8         inner_dmac_15_0[0x10];
+};
+
+struct mlx5_ifc_match_definer_format_29_bits {
+	u8         reserved_at_0[0xc0];
+
+	u8         outer_ip_dest_addr[0x80];
+
+	u8         outer_ip_src_addr[0x80];
+
+	u8         outer_l4_sport[0x10];
+	u8         outer_l4_dport[0x10];
+
+	u8         reserved_at_1e0[0x20];
+};
+
+struct mlx5_ifc_match_definer_format_30_bits {
+	u8         reserved_at_0[0xa0];
+
+	u8         outer_ip_dest_addr[0x80];
+
+	u8         outer_ip_src_addr[0x80];
+
+	u8         outer_dmac_47_16[0x20];
+
+	u8         outer_smac_47_16[0x20];
+
+	u8         outer_smac_15_0[0x10];
+	u8         outer_dmac_15_0[0x10];
+};
+
+struct mlx5_ifc_match_definer_format_31_bits {
+	u8         reserved_at_0[0xc0];
+
+	u8         inner_ip_dest_addr[0x80];
+
+	u8         inner_ip_src_addr[0x80];
+
+	u8         inner_l4_sport[0x10];
+	u8         inner_l4_dport[0x10];
+
+	u8         reserved_at_1e0[0x20];
+};
+
+struct mlx5_ifc_match_definer_format_32_bits {
+	u8         reserved_at_0[0xa0];
+
+	u8         inner_ip_dest_addr[0x80];
+
+	u8         inner_ip_src_addr[0x80];
+
+	u8         inner_dmac_47_16[0x20];
+
+	u8         inner_smac_47_16[0x20];
+
+	u8         inner_smac_15_0[0x10];
+	u8         inner_dmac_15_0[0x10];
+};
+
+struct mlx5_ifc_match_definer_bits {
+	u8         modify_field_select[0x40];
+
+	u8         reserved_at_40[0x40];
+
+	u8         reserved_at_80[0x10];
+	u8         format_id[0x10];
+
+	u8         reserved_at_a0[0x160];
+
+	u8         match_mask[16][0x20];
+};
+
+struct mlx5_ifc_general_obj_in_cmd_hdr_bits {
+	u8         opcode[0x10];
+	u8         uid[0x10];
+
+	u8         vhca_tunnel_id[0x10];
+	u8         obj_type[0x10];
+
+	u8         obj_id[0x20];
+
+	u8         reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_general_obj_out_cmd_hdr_bits {
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         obj_id[0x20];
+
+	u8         reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_create_match_definer_in_bits {
+	struct mlx5_ifc_general_obj_in_cmd_hdr_bits general_obj_in_cmd_hdr;
+
+	struct mlx5_ifc_match_definer_bits obj_context;
+};
+
+struct mlx5_ifc_create_match_definer_out_bits {
+	struct mlx5_ifc_general_obj_out_cmd_hdr_bits general_obj_out_cmd_hdr;
+};
+
 enum {
 	MLX5_QUERY_FLOW_GROUP_OUT_MATCH_CRITERIA_ENABLE_OUTER_HEADERS    = 0x0,
 	MLX5_QUERY_FLOW_GROUP_OUT_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS  = 0x1,
@@ -8139,6 +8370,11 @@ struct mlx5_ifc_create_flow_group_out_bits {
 	u8         reserved_at_60[0x20];
 };
 
+enum {
+	MLX5_CREATE_FLOW_GROUP_IN_GROUP_TYPE_TCAM_SUBTABLE  = 0x0,
+	MLX5_CREATE_FLOW_GROUP_IN_GROUP_TYPE_HASH_SPLIT     = 0x1,
+};
+
 enum {
 	MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_OUTER_HEADERS     = 0x0,
 	MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS   = 0x1,
@@ -8160,7 +8396,9 @@ struct mlx5_ifc_create_flow_group_in_bits {
 	u8         reserved_at_60[0x20];
 
 	u8         table_type[0x8];
-	u8         reserved_at_88[0x18];
+	u8         reserved_at_88[0x4];
+	u8         group_type[0x4];
+	u8         reserved_at_90[0x10];
 
 	u8         reserved_at_a0[0x8];
 	u8         table_id[0x18];
@@ -8175,7 +8413,10 @@ struct mlx5_ifc_create_flow_group_in_bits {
 
 	u8         end_flow_index[0x20];
 
-	u8         reserved_at_140[0xa0];
+	u8         reserved_at_140[0x10];
+	u8         match_definer_id[0x10];
+
+	u8         reserved_at_160[0x80];
 
 	u8         reserved_at_1e0[0x18];
 	u8         match_criteria_enable[0x8];
@@ -10671,29 +10912,6 @@ struct mlx5_ifc_dealloc_memic_out_bits {
 	u8         reserved_at_40[0x40];
 };
 
-struct mlx5_ifc_general_obj_in_cmd_hdr_bits {
-	u8         opcode[0x10];
-	u8         uid[0x10];
-
-	u8         vhca_tunnel_id[0x10];
-	u8         obj_type[0x10];
-
-	u8         obj_id[0x20];
-
-	u8         reserved_at_60[0x20];
-};
-
-struct mlx5_ifc_general_obj_out_cmd_hdr_bits {
-	u8         status[0x8];
-	u8         reserved_at_8[0x18];
-
-	u8         syndrome[0x20];
-
-	u8         obj_id[0x20];
-
-	u8         reserved_at_60[0x20];
-};
-
 struct mlx5_ifc_umem_bits {
 	u8         reserved_at_0[0x80];
 
-- 
cgit v1.2.3


From 58a606dba708f114e7f412b5c3024027e8a73e34 Mon Sep 17 00:00:00 2001
From: Maor Gottlieb <maorg@nvidia.com>
Date: Tue, 3 Aug 2021 10:04:41 +0300
Subject: net/mlx5: Introduce new uplink destination type

The uplink destination type should be used in rules to steer the
packet to the uplink when the device is in steering based LAG mode.

Signed-off-by: Maor Gottlieb <maorg@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 8f41145bc6ef..09e43019d877 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1766,6 +1766,7 @@ enum mlx5_flow_destination_type {
 	MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE   = 0x1,
 	MLX5_FLOW_DESTINATION_TYPE_TIR          = 0x2,
 	MLX5_FLOW_DESTINATION_TYPE_FLOW_SAMPLER = 0x6,
+	MLX5_FLOW_DESTINATION_TYPE_UPLINK       = 0x8,
 
 	MLX5_FLOW_DESTINATION_TYPE_PORT         = 0x99,
 	MLX5_FLOW_DESTINATION_TYPE_COUNTER      = 0x100,
-- 
cgit v1.2.3


From 25c02edfd41f0dd7aad9115149625d7e7f441b7d Mon Sep 17 00:00:00 2001
From: Alexandru Ardelean <aardelean@deviqon.com>
Date: Fri, 3 Sep 2021 10:29:13 +0300
Subject: iio: inkern: introduce devm_iio_map_array_register() short-hand
 function

This change introduces a device-managed variant to the
iio_map_array_register() function. It's a simple implementation of calling
iio_map_array_register() and registering a callback to
iio_map_array_unregister() with the devm_add_action_or_reset().

The function uses an explicit 'dev' parameter to bind the unwinding to. It
could have been implemented to implicitly use the parent of the IIO device,
however it shouldn't be too expensive to callers to just specify to which
device object to bind this unwind call.
It would make the API a bit more flexible.

Signed-off-by: Alexandru Ardelean <aardelean@deviqon.com>
Link: https://lore.kernel.org/r/20210903072917.45769-2-aardelean@deviqon.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/driver.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iio/driver.h b/include/linux/iio/driver.h
index 36de60a5da7a..7a157ed218f6 100644
--- a/include/linux/iio/driver.h
+++ b/include/linux/iio/driver.h
@@ -8,6 +8,7 @@
 #ifndef _IIO_INKERN_H_
 #define _IIO_INKERN_H_
 
+struct device;
 struct iio_dev;
 struct iio_map;
 
@@ -26,4 +27,17 @@ int iio_map_array_register(struct iio_dev *indio_dev,
  */
 int iio_map_array_unregister(struct iio_dev *indio_dev);
 
+/**
+ * devm_iio_map_array_register - device-managed version of iio_map_array_register
+ * @dev:	Device object to which to bind the unwinding of this registration
+ * @indio_dev:	Pointer to the iio_dev structure
+ * @maps:	Pointer to an IIO map object which is to be registered to this IIO device
+ *
+ * This function will call iio_map_array_register() to register an IIO map object
+ * and will also hook a callback to the iio_map_array_unregister() function to
+ * handle de-registration of the IIO map object when the device's refcount goes to
+ * zero.
+ */
+int devm_iio_map_array_register(struct device *dev, struct iio_dev *indio_dev, struct iio_map *maps);
+
 #endif
-- 
cgit v1.2.3


From 31fa357ac809affd9f9a7d0b5d1991951e16beec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nuno=20S=C3=A1?= <nuno.sa@analog.com>
Date: Fri, 3 Sep 2021 16:14:20 +0200
Subject: iio: adis: handle devices that cannot unmask the drdy pin
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some devices can't mask/unmask the data ready pin and in those cases
each driver was just calling '{dis}enable_irq()' to control the trigger
state. This change, moves that handling into the library by introducing
a new boolean in the data structure that tells the library that the
device cannot unmask the pin.

On top of controlling the trigger state, we can also use this flag to
automatically request the IRQ with 'IRQF_NO_AUTOEN' in case it is set.
So far, all users of the library want to start operation with IRQs/DRDY
pin disabled so it should be fairly safe to do this inside the library.

Signed-off-by: Nuno Sá <nuno.sa@analog.com>
Link: https://lore.kernel.org/r/20210903141423.517028-3-nuno.sa@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/imu/adis.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iio/imu/adis.h b/include/linux/iio/imu/adis.h
index cf49997d5903..7c02f5292eea 100644
--- a/include/linux/iio/imu/adis.h
+++ b/include/linux/iio/imu/adis.h
@@ -49,6 +49,7 @@ struct adis_timeout {
  * @status_error_mask: Bitmask of errors supported by the device
  * @timeouts: Chip specific delays
  * @enable_irq: Hook for ADIS devices that have a special IRQ enable/disable
+ * @unmasked_drdy: True for devices that cannot mask/unmask the data ready pin
  * @has_paging: True if ADIS device has paged registers
  * @burst_reg_cmd:	Register command that triggers burst
  * @burst_len:		Burst size in the SPI RX buffer. If @burst_max_len is defined,
@@ -78,6 +79,7 @@ struct adis_data {
 	unsigned int status_error_mask;
 
 	int (*enable_irq)(struct adis *adis, bool enable);
+	bool unmasked_drdy;
 
 	bool has_paging;
 
-- 
cgit v1.2.3


From 95ec3fdf2b79eaff79e78688bbc2f7dbb98d68b6 Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Date: Sun, 13 Jun 2021 16:10:36 +0100
Subject: iio: core: Introduce iio_push_to_buffers_with_ts_unaligned()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Whilst it is almost always possible to arrange for scan data to be
read directly into a buffer that is suitable for passing to
iio_push_to_buffers_with_timestamp(), there are a few places where
leading data needs to be skipped over.

For these cases introduce a function that will allocate an appropriate
sized and aligned bounce buffer (if not already allocated) and copy
the unaligned data into that before calling
iio_push_to_buffers_with_timestamp() on the bounce buffer.
We tie the lifespace of this buffer to that of the iio_dev.dev
which should ensure no memory leaks occur.

Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Nuno Sá <nuno.sa@analog.com>
Link: https://lore.kernel.org/r/20210613151039.569883-2-jic23@kernel.org
---
 include/linux/iio/buffer.h     | 4 ++++
 include/linux/iio/iio-opaque.h | 4 ++++
 2 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h
index b6928ac5c63d..451379a3984a 100644
--- a/include/linux/iio/buffer.h
+++ b/include/linux/iio/buffer.h
@@ -38,6 +38,10 @@ static inline int iio_push_to_buffers_with_timestamp(struct iio_dev *indio_dev,
 	return iio_push_to_buffers(indio_dev, data);
 }
 
+int iio_push_to_buffers_with_ts_unaligned(struct iio_dev *indio_dev,
+					  const void *data, size_t data_sz,
+					  int64_t timestamp);
+
 bool iio_validate_scan_mask_onehot(struct iio_dev *indio_dev,
 				   const unsigned long *mask);
 
diff --git a/include/linux/iio/iio-opaque.h b/include/linux/iio/iio-opaque.h
index c9504e9da571..2be12b7b5dc5 100644
--- a/include/linux/iio/iio-opaque.h
+++ b/include/linux/iio/iio-opaque.h
@@ -23,6 +23,8 @@
  * @groupcounter:		index of next attribute group
  * @legacy_scan_el_group:	attribute group for legacy scan elements attribute group
  * @legacy_buffer_group:	attribute group for legacy buffer attributes group
+ * @bounce_buffer:		for devices that call iio_push_to_buffers_with_timestamp_unaligned()
+ * @bounce_buffer_size:		size of currently allocate bounce buffer
  * @scan_index_timestamp:	cache of the index to the timestamp
  * @clock_id:			timestamping clock posix identifier
  * @chrdev:			associated character device
@@ -50,6 +52,8 @@ struct iio_dev_opaque {
 	int				groupcounter;
 	struct attribute_group		legacy_scan_el_group;
 	struct attribute_group		legacy_buffer_group;
+	void				*bounce_buffer;
+	size_t				bounce_buffer_size;
 
 	unsigned int			scan_index_timestamp;
 	clockid_t			clock_id;
-- 
cgit v1.2.3


From 9eeee3b0bf190b4f677af27e24ba0cd1c030e49b Mon Sep 17 00:00:00 2001
From: Mihail Chindris <mihail.chindris@analog.com>
Date: Thu, 7 Oct 2021 08:00:30 +0000
Subject: iio: Add output buffer support

Currently IIO only supports buffer mode for capture devices like ADCs. Add
support for buffered mode for output devices like DACs.

The output buffer implementation is analogous to the input buffer
implementation. Instead of using read() to get data from the buffer write()
is used to copy data into the buffer.

poll() with POLLOUT will wakeup if there is space available.

Drivers can remove data from a buffer using iio_pop_from_buffer(), the
function can e.g. called from a trigger handler to write the data to
hardware.

A buffer can only be either a output buffer or an input, but not both. So,
for a device that has an ADC and DAC path, this will mean 2 IIO buffers
(one for each direction).

The direction of the buffer is decided by the new direction field of the
iio_buffer struct and should be set after allocating and before registering
it.

Co-developed-by: Lars-Peter Clausen <lars@metafoo.de>
Signed-off-by: Lars-Peter Clausen <lars@metafoo.de>
Co-developed-by: Alexandru Ardelean <alexandru.ardelean@analog.com>
Signed-off-by: Alexandru Ardelean <alexandru.ardelean@analog.com>
Signed-off-by: Mihail Chindris <mihail.chindris@analog.com>
Link: https://lore.kernel.org/r/20211007080035.2531-2-mihail.chindris@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/buffer.h      |  7 +++++++
 include/linux/iio/buffer_impl.h | 11 +++++++++++
 2 files changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h
index 451379a3984a..418b1307d3f2 100644
--- a/include/linux/iio/buffer.h
+++ b/include/linux/iio/buffer.h
@@ -11,8 +11,15 @@
 
 struct iio_buffer;
 
+enum iio_buffer_direction {
+	IIO_BUFFER_DIRECTION_IN,
+	IIO_BUFFER_DIRECTION_OUT,
+};
+
 int iio_push_to_buffers(struct iio_dev *indio_dev, const void *data);
 
+int iio_pop_from_buffer(struct iio_buffer *buffer, void *data);
+
 /**
  * iio_push_to_buffers_with_timestamp() - push data and timestamp to buffers
  * @indio_dev:		iio_dev structure for device.
diff --git a/include/linux/iio/buffer_impl.h b/include/linux/iio/buffer_impl.h
index 245b32918ae1..e2ca8ea23e19 100644
--- a/include/linux/iio/buffer_impl.h
+++ b/include/linux/iio/buffer_impl.h
@@ -7,6 +7,7 @@
 #ifdef CONFIG_IIO_BUFFER
 
 #include <uapi/linux/iio/buffer.h>
+#include <linux/iio/buffer.h>
 
 struct iio_dev;
 struct iio_buffer;
@@ -23,6 +24,10 @@ struct iio_buffer;
  * @read:		try to get a specified number of bytes (must exist)
  * @data_available:	indicates how much data is available for reading from
  *			the buffer.
+ * @remove_from:	remove scan from buffer. Drivers should calls this to
+ *			remove a scan from a buffer.
+ * @write:		try to write a number of bytes
+ * @space_available:	returns the amount of bytes available in a buffer
  * @request_update:	if a parameter change has been marked, update underlying
  *			storage.
  * @set_bytes_per_datum:set number of bytes per datum
@@ -49,6 +54,9 @@ struct iio_buffer_access_funcs {
 	int (*store_to)(struct iio_buffer *buffer, const void *data);
 	int (*read)(struct iio_buffer *buffer, size_t n, char __user *buf);
 	size_t (*data_available)(struct iio_buffer *buffer);
+	int (*remove_from)(struct iio_buffer *buffer, void *data);
+	int (*write)(struct iio_buffer *buffer, size_t n, const char __user *buf);
+	size_t (*space_available)(struct iio_buffer *buffer);
 
 	int (*request_update)(struct iio_buffer *buffer);
 
@@ -80,6 +88,9 @@ struct iio_buffer {
 	/**  @bytes_per_datum: Size of individual datum including timestamp. */
 	size_t bytes_per_datum;
 
+	/* @direction: Direction of the data stream (in/out). */
+	enum iio_buffer_direction direction;
+
 	/**
 	 * @access: Buffer access functions associated with the
 	 * implementation.
-- 
cgit v1.2.3


From c02cd5c19c17698f12b731e898127095f9bc2921 Mon Sep 17 00:00:00 2001
From: Alexandru Ardelean <alexandru.ardelean@analog.com>
Date: Thu, 7 Oct 2021 08:00:32 +0000
Subject: iio: triggered-buffer: extend support to configure output buffers

Now that output (kfifo) buffers are supported, we need to extend the
{devm_}iio_triggered_buffer_setup_ext() parameter list to take a direction
parameter.

This allows us to attach an output triggered buffer to a DAC device.
Unfortunately it's a bit difficult to add another macro to avoid changing 5
drivers where {devm_}iio_triggered_buffer_setup_ext() is used.
Well, it's doable, but may not be worth the trouble vs just updating all
these 5 drivers.

Signed-off-by: Alexandru Ardelean <alexandru.ardelean@analog.com>
Signed-off-by: Mihail Chindris <mihail.chindris@analog.com>
Link: https://lore.kernel.org/r/20211007080035.2531-4-mihail.chindris@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/triggered_buffer.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iio/triggered_buffer.h b/include/linux/iio/triggered_buffer.h
index 7f154d1f8739..7490b05fc5b2 100644
--- a/include/linux/iio/triggered_buffer.h
+++ b/include/linux/iio/triggered_buffer.h
@@ -2,6 +2,7 @@
 #ifndef _LINUX_IIO_TRIGGERED_BUFFER_H_
 #define _LINUX_IIO_TRIGGERED_BUFFER_H_
 
+#include <linux/iio/buffer.h>
 #include <linux/interrupt.h>
 
 struct attribute;
@@ -11,21 +12,27 @@ struct iio_buffer_setup_ops;
 int iio_triggered_buffer_setup_ext(struct iio_dev *indio_dev,
 	irqreturn_t (*h)(int irq, void *p),
 	irqreturn_t (*thread)(int irq, void *p),
+	enum iio_buffer_direction direction,
 	const struct iio_buffer_setup_ops *setup_ops,
 	const struct attribute **buffer_attrs);
 void iio_triggered_buffer_cleanup(struct iio_dev *indio_dev);
 
 #define iio_triggered_buffer_setup(indio_dev, h, thread, setup_ops)		\
-	iio_triggered_buffer_setup_ext((indio_dev), (h), (thread), (setup_ops), NULL)
+	iio_triggered_buffer_setup_ext((indio_dev), (h), (thread),		\
+					IIO_BUFFER_DIRECTION_IN, (setup_ops),	\
+					NULL)
 
 int devm_iio_triggered_buffer_setup_ext(struct device *dev,
 					struct iio_dev *indio_dev,
 					irqreturn_t (*h)(int irq, void *p),
 					irqreturn_t (*thread)(int irq, void *p),
+					enum iio_buffer_direction direction,
 					const struct iio_buffer_setup_ops *ops,
 					const struct attribute **buffer_attrs);
 
 #define devm_iio_triggered_buffer_setup(dev, indio_dev, h, thread, setup_ops)	\
-	devm_iio_triggered_buffer_setup_ext((dev), (indio_dev), (h), (thread), (setup_ops), NULL)
+	devm_iio_triggered_buffer_setup_ext((dev), (indio_dev), (h), (thread),	\
+					    IIO_BUFFER_DIRECTION_IN,		\
+					    (setup_ops), NULL)
 
 #endif
-- 
cgit v1.2.3


From aafa1cafedca7c64d6a43cd21488d28d1d1be884 Mon Sep 17 00:00:00 2001
From: Vadim Pasternak <vadimp@nvidia.com>
Date: Sat, 2 Oct 2021 12:32:30 +0300
Subject: platform_data/mlxreg: Add new type to support modular systems

Add new types for the Nvidia modular systems MSN4800 which could
be equipped with the different types of replaceable line cards.

Add new type to specify the kind of hotplug events for the line cards.
The line card events are generated by the programmable device located
on the main board. This device implements interrupt controller logic.
Line card interrupts are associated with different line cards states
during its initialization: insertion, security signature validation,
power good state, security validation, hardware-firmware
synchronization state, line card PHYs readiness state, firmware
availability for line card ports. Also under some circumstances
hardware can generate thermal shutdown for particular line card.

Add new type specifying the action, which should be performed when
particular hotplug event is received. This action defines in which way
hotplug event should be handled by hotplug driver. There are the next
actions types:
- Connect I2C device with empty 'platform_data' field according to the
  platform topology, if device is configured (for example, power unit
  micro-controller driver, when power unit is connected to power source
  (this is what is currently supported).
- Connect device with 'platform_data' field set according to the
  platform topology. The purpose is to pass 'platform_data' through
  hotplug driver to underlying device (for example line card driver).
- No device is associated with hotplug event - just send "udev" event
 (this is what is currently supported).

Extend structure 'mlxreg_hotplug_device' with hotplug action field.

Extend structure 'mlxreg_core_data' with:
- Registers for line card power and enabling control.
- Slot number field, to indicate at which physical slot replaceable
  line card device is located.

Signed-off-by: Vadim Pasternak <vadimp@nvidia.com>
Reviewed-by: Michael Shych <michaelsh@nvidia.com>
Link: https://lore.kernel.org/r/20211002093238.3771419-2-vadimp@nvidia.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/platform_data/mlxreg.h | 57 ++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/mlxreg.h b/include/linux/platform_data/mlxreg.h
index 101333fe2b8d..49f0e15a10dd 100644
--- a/include/linux/platform_data/mlxreg.h
+++ b/include/linux/platform_data/mlxreg.h
@@ -24,6 +24,51 @@ enum mlxreg_wdt_type {
 	MLX_WDT_TYPE3,
 };
 
+/**
+ * enum mlxreg_hotplug_kind - kind of hotplug entry
+ *
+ * @MLXREG_HOTPLUG_DEVICE_NA: do not care;
+ * @MLXREG_HOTPLUG_LC_PRESENT: entry for line card presence in/out events;
+ * @MLXREG_HOTPLUG_LC_VERIFIED: entry for line card verification status events
+ *				coming after line card security signature validation;
+ * @MLXREG_HOTPLUG_LC_POWERED: entry for line card power on/off events;
+ * @MLXREG_HOTPLUG_LC_SYNCED: entry for line card synchronization events, coming
+ *			      after hardware-firmware synchronization handshake;
+ * @MLXREG_HOTPLUG_LC_READY: entry for line card ready events, indicating line card
+			     PHYs ready / unready state;
+ * @MLXREG_HOTPLUG_LC_ACTIVE: entry for line card active events, indicating firmware
+ *			      availability / unavailability for the ports on line card;
+ * @MLXREG_HOTPLUG_LC_THERMAL: entry for line card thermal shutdown events, positive
+ *			       event indicates that system should power off the line
+ *			       card for which this event has been received;
+ */
+enum mlxreg_hotplug_kind {
+	MLXREG_HOTPLUG_DEVICE_NA = 0,
+	MLXREG_HOTPLUG_LC_PRESENT = 1,
+	MLXREG_HOTPLUG_LC_VERIFIED = 2,
+	MLXREG_HOTPLUG_LC_POWERED = 3,
+	MLXREG_HOTPLUG_LC_SYNCED = 4,
+	MLXREG_HOTPLUG_LC_READY = 5,
+	MLXREG_HOTPLUG_LC_ACTIVE = 6,
+	MLXREG_HOTPLUG_LC_THERMAL = 7,
+};
+
+/**
+ * enum mlxreg_hotplug_device_action - hotplug device action required for
+ *				       driver's connectivity
+ *
+ * @MLXREG_HOTPLUG_DEVICE_DEFAULT_ACTION: probe device for 'on' event, remove
+ *					  for 'off' event;
+ * @MLXREG_HOTPLUG_DEVICE_PLATFORM_ACTION: probe platform device for 'on'
+ *					   event, remove for 'off' event;
+ * @MLXREG_HOTPLUG_DEVICE_NO_ACTION: no connectivity action is required;
+ */
+enum mlxreg_hotplug_device_action {
+	MLXREG_HOTPLUG_DEVICE_DEFAULT_ACTION = 0,
+	MLXREG_HOTPLUG_DEVICE_PLATFORM_ACTION = 1,
+	MLXREG_HOTPLUG_DEVICE_NO_ACTION = 2,
+};
+
 /**
  * struct mlxreg_hotplug_device - I2C device data:
  *
@@ -31,6 +76,7 @@ enum mlxreg_wdt_type {
  * @client: I2C device client;
  * @brdinfo: device board information;
  * @nr: I2C device adapter number, to which device is to be attached;
+ * @action: action to be performed upon event receiving;
  *
  * Structure represents I2C hotplug device static data (board topology) and
  * dynamic data (related kernel objects handles).
@@ -40,6 +86,7 @@ struct mlxreg_hotplug_device {
 	struct i2c_client *client;
 	struct i2c_board_info *brdinfo;
 	int nr;
+	enum mlxreg_hotplug_device_action action;
 };
 
 /**
@@ -51,12 +98,16 @@ struct mlxreg_hotplug_device {
  * @bit: attribute effective bit;
  * @capability: attribute capability register;
  * @reg_prsnt: attribute presence register;
+ * @reg_sync: attribute synch register;
+ * @reg_pwr: attribute power register;
+ * @reg_ena: attribute enable register;
  * @mode: access mode;
  * @np - pointer to node platform associated with attribute;
  * @hpdev - hotplug device data;
  * @health_cntr: dynamic device health indication counter;
  * @attached: true if device has been attached after good health indication;
  * @regnum: number of registers occupied by multi-register attribute;
+ * @slot: slot number, at which device is located;
  */
 struct mlxreg_core_data {
 	char label[MLXREG_CORE_LABEL_MAX_SIZE];
@@ -65,18 +116,23 @@ struct mlxreg_core_data {
 	u32 bit;
 	u32 capability;
 	u32 reg_prsnt;
+	u32 reg_sync;
+	u32 reg_pwr;
+	u32 reg_ena;
 	umode_t	mode;
 	struct device_node *np;
 	struct mlxreg_hotplug_device hpdev;
 	u32 health_cntr;
 	bool attached;
 	u8 regnum;
+	u8 slot;
 };
 
 /**
  * struct mlxreg_core_item - same type components controlled by the driver:
  *
  * @data: component data;
+ * @kind: kind of hotplug attribute;
  * @aggr_mask: group aggregation mask;
  * @reg: group interrupt status register;
  * @mask: group interrupt mask;
@@ -89,6 +145,7 @@ struct mlxreg_core_data {
  */
 struct mlxreg_core_item {
 	struct mlxreg_core_data *data;
+	enum mlxreg_hotplug_kind kind;
 	u32 aggr_mask;
 	u32 reg;
 	u32 mask;
-- 
cgit v1.2.3


From bb1023b6da379985ff888dcd7bc601735d02267a Mon Sep 17 00:00:00 2001
From: Vadim Pasternak <vadimp@nvidia.com>
Date: Sat, 2 Oct 2021 12:32:32 +0300
Subject: platform/mellanox: mlxreg-hotplug: Extend logic for hotplug devices
 operations

Extend the structure 'mlxreg_hotplug_device" with platform device field
to allow transition of the register map and system interrupt line number
to underlying hotplug devices, sharing the same register map and
same interrupt line with 'mlxreg-hotplug' driver.

Extend logic for hotplug devices creation and removing according to
the action associated with the hotplug device description. Previously
hotplug driver was capable to attach / de-attach upon hotplug events
only I2C devices handled by simple I2C drivers. Now it should be able
to attach also devices handled by the platform drivers.

The motivation is to allow transition of platform data like:
- system interrupt line number, sharing with 'mlxreg-hotplug' to
  underlying hotplug devices.
- shared register map of programmable devices on main board to
  underlying hotplug devices.

Additioanlly the number of 'sysfs' attributes is increased, since
modular system defines more 'sysfs' attributes.

Signed-off-by: Vadim Pasternak <vadimp@nvidia.com>
Reviewed-by: Michael Shych <michaelsh@nvidia.com>
Link: https://lore.kernel.org/r/20211002093238.3771419-4-vadimp@nvidia.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/platform_data/mlxreg.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/mlxreg.h b/include/linux/platform_data/mlxreg.h
index 49f0e15a10dd..3122d550dc00 100644
--- a/include/linux/platform_data/mlxreg.h
+++ b/include/linux/platform_data/mlxreg.h
@@ -69,6 +69,19 @@ enum mlxreg_hotplug_device_action {
 	MLXREG_HOTPLUG_DEVICE_NO_ACTION = 2,
 };
 
+/**
+ * struct mlxreg_core_hotplug_notifier - hotplug notifier block:
+ *
+ * @identity: notifier identity name;
+ * @handle: user handle to be passed by user handler function;
+ * @user_handler: user handler function associated with the event;
+ */
+struct mlxreg_core_hotplug_notifier {
+	char identity[MLXREG_CORE_LABEL_MAX_SIZE];
+	void *handle;
+	int (*user_handler)(void *handle, enum mlxreg_hotplug_kind kind, u8 action);
+};
+
 /**
  * struct mlxreg_hotplug_device - I2C device data:
  *
@@ -76,7 +89,11 @@ enum mlxreg_hotplug_device_action {
  * @client: I2C device client;
  * @brdinfo: device board information;
  * @nr: I2C device adapter number, to which device is to be attached;
+ * @pdev: platform device, if device is instantiated as a platform device;
  * @action: action to be performed upon event receiving;
+ * @handle: user handle to be passed by user handler function;
+ * @user_handler: user handler function associated with the event;
+ * @notifier: pointer to event notifier block;
  *
  * Structure represents I2C hotplug device static data (board topology) and
  * dynamic data (related kernel objects handles).
@@ -86,7 +103,11 @@ struct mlxreg_hotplug_device {
 	struct i2c_client *client;
 	struct i2c_board_info *brdinfo;
 	int nr;
+	struct platform_device *pdev;
 	enum mlxreg_hotplug_device_action action;
+	void *handle;
+	int (*user_handler)(void *handle, enum mlxreg_hotplug_kind kind, u8 action);
+	struct mlxreg_core_hotplug_notifier *notifier;
 };
 
 /**
@@ -104,10 +125,12 @@ struct mlxreg_hotplug_device {
  * @mode: access mode;
  * @np - pointer to node platform associated with attribute;
  * @hpdev - hotplug device data;
+ * @notifier: pointer to event notifier block;
  * @health_cntr: dynamic device health indication counter;
  * @attached: true if device has been attached after good health indication;
  * @regnum: number of registers occupied by multi-register attribute;
  * @slot: slot number, at which device is located;
+ * @secured: if set indicates that entry access is secured;
  */
 struct mlxreg_core_data {
 	char label[MLXREG_CORE_LABEL_MAX_SIZE];
@@ -122,6 +145,7 @@ struct mlxreg_core_data {
 	umode_t	mode;
 	struct device_node *np;
 	struct mlxreg_hotplug_device hpdev;
+	struct mlxreg_core_hotplug_notifier *notifier;
 	u32 health_cntr;
 	bool attached;
 	u8 regnum;
-- 
cgit v1.2.3


From 9d93d7877c9158d059028681b66a0c192f85c64d Mon Sep 17 00:00:00 2001
From: Vadim Pasternak <vadimp@nvidia.com>
Date: Sat, 2 Oct 2021 12:32:35 +0300
Subject: platform_data/mlxreg: Add new field for secured access

Extend structure 'mlxreg_core_data' with the field "secured". The
purpose of this field is to restrict access to some attributes, if
kernel is configured with security options, like:
LOCK_DOWN_KERNEL_FORCE_CONFIDENTIALITY.
Access to some attributes, which for example, allow burning of some
hardware components, like FPGA, CPLD, SPI, etcetera can break the
system. In case user does not want to allow such access, it can disable
it by setting security options.

Signed-off-by: Vadim Pasternak <vadimp@nvidia.com>
Reviewed-by: Michael Shych <michaelsh@nvidia.com>
Link: https://lore.kernel.org/r/20211002093238.3771419-7-vadimp@nvidia.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/platform_data/mlxreg.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/mlxreg.h b/include/linux/platform_data/mlxreg.h
index 3122d550dc00..40185f9d7c14 100644
--- a/include/linux/platform_data/mlxreg.h
+++ b/include/linux/platform_data/mlxreg.h
@@ -150,6 +150,7 @@ struct mlxreg_core_data {
 	bool attached;
 	u8 regnum;
 	u8 slot;
+	u8 secured;
 };
 
 /**
-- 
cgit v1.2.3


From 54f5b3615f199673ecb23f0a6847fd9c43aaa012 Mon Sep 17 00:00:00 2001
From: Wang Kefeng <wangkefeng.wang@huawei.com>
Date: Mon, 23 Aug 2021 10:41:41 +0100
Subject: ARM: 9121/1: amba: Drop unused functions about APB/AHB devices add

No one use the following functions, kill them.

  amba_aphb_device_add()
  amba_apb_device_add()
  amba_apb_device_add_res()
  amba_ahb_device_add()
  amba_ahb_device_add_res()

Cc: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
---
 include/linux/amba/bus.h | 18 ------------------
 1 file changed, 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h
index c68d87b87283..edfcf7a14dcd 100644
--- a/include/linux/amba/bus.h
+++ b/include/linux/amba/bus.h
@@ -122,24 +122,6 @@ struct amba_device *amba_device_alloc(const char *, resource_size_t, size_t);
 void amba_device_put(struct amba_device *);
 int amba_device_add(struct amba_device *, struct resource *);
 int amba_device_register(struct amba_device *, struct resource *);
-struct amba_device *amba_apb_device_add(struct device *parent, const char *name,
-					resource_size_t base, size_t size,
-					int irq1, int irq2, void *pdata,
-					unsigned int periphid);
-struct amba_device *amba_ahb_device_add(struct device *parent, const char *name,
-					resource_size_t base, size_t size,
-					int irq1, int irq2, void *pdata,
-					unsigned int periphid);
-struct amba_device *
-amba_apb_device_add_res(struct device *parent, const char *name,
-			resource_size_t base, size_t size, int irq1,
-			int irq2, void *pdata, unsigned int periphid,
-			struct resource *resbase);
-struct amba_device *
-amba_ahb_device_add_res(struct device *parent, const char *name,
-			resource_size_t base, size_t size, int irq1,
-			int irq2, void *pdata, unsigned int periphid,
-			struct resource *resbase);
 void amba_device_unregister(struct amba_device *);
 struct amba_device *amba_find_device(const char *, struct device *, unsigned int, unsigned int);
 int amba_request_regions(struct amba_device *, const char *);
-- 
cgit v1.2.3


From f5245a5fdf757f50a6c905fc16cceb1a6146ccf5 Mon Sep 17 00:00:00 2001
From: David Lechner <david@lechnology.com>
Date: Sun, 17 Oct 2021 13:55:21 -0500
Subject: counter: drop chrdev_lock

This removes the chrdev_lock from the counter subsystem. This was
intended to prevent opening the chrdev more than once. However, this
doesn't work in practice since userspace can duplicate file descriptors
and pass file descriptors to other processes. Since this protection
can't be relied on, it is best to just remove it.

Suggested-by: Greg KH <gregkh@linuxfoundation.org>
Acked-by: William Breathitt Gray <vilhelm.gray@gmail.com>
Signed-off-by: David Lechner <david@lechnology.com>
Link: https://lore.kernel.org/r/20211017185521.3468640-1-david@lechnology.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/counter.h | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/counter.h b/include/linux/counter.h
index 22b14a552b1d..0fd99e255a50 100644
--- a/include/linux/counter.h
+++ b/include/linux/counter.h
@@ -297,7 +297,6 @@ struct counter_ops {
  * @events:		queue of detected Counter events
  * @events_wait:	wait queue to allow blocking reads of Counter events
  * @events_lock:	lock to protect Counter events queue read operations
- * @chrdev_lock:	lock to limit chrdev to a single open at a time
  * @ops_exist_lock:	lock to prevent use during removal
  */
 struct counter_device {
@@ -325,12 +324,6 @@ struct counter_device {
 	DECLARE_KFIFO_PTR(events, struct counter_event);
 	wait_queue_head_t events_wait;
 	struct mutex events_lock;
-	/*
-	 * chrdev_lock is locked by counter_chrdev_open() and unlocked by
-	 * counter_chrdev_release(), so a mutex is not possible here because
-	 * chrdev_lock will invariably be held when returning to user space
-	 */
-	atomic_t chrdev_lock;
 	struct mutex ops_exist_lock;
 };
 
-- 
cgit v1.2.3


From 5c67aa59bd8f99d70c81b5e71bd02083056a8486 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 14 Oct 2021 16:26:11 +0300
Subject: mmc: sdhci-pci: Remove dead code (struct sdhci_pci_data et al)

The last user of this struct gone a couple of releases ago.
Besides that there were not so many users of this API for
more than 10 years:

1/ The one is Intel Merrifield, that had been added 2016-08-31
   by the commit 3976b0380b31 ("x86/platform/intel-mid: Enable
   SD card detection on Merrifield") and removed 2021-02-11 by
   the commit 4590d98f5a4f ("sfi: Remove framework for deprecated
   firmware").

2/ The other is Intel Sunrisepoint related, that had been added
   2015-02-06 by the commit e1bfad6d936d ("mmc: sdhci-pci: Add
   support for drive strength selection for SPT") and removed
   2017-03-20 by the commit 51ced59cc02e ("mmc: sdhci-pci: Use
   ACPI DSM to get driver strength for some Intel devices").

Effectively this is a revert of the commit 52c506f0bc72 ("mmc:
sdhci-pci: add platform data").

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Adrian Hunter <adrian.hunter@intel.com>
Link: https://lore.kernel.org/r/20211014132613.27861-4-andriy.shevchenko@linux.intel.com
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 include/linux/mmc/sdhci-pci-data.h | 18 ------------------
 1 file changed, 18 deletions(-)
 delete mode 100644 include/linux/mmc/sdhci-pci-data.h

(limited to 'include/linux')

diff --git a/include/linux/mmc/sdhci-pci-data.h b/include/linux/mmc/sdhci-pci-data.h
deleted file mode 100644
index 1d42872d22f3..000000000000
--- a/include/linux/mmc/sdhci-pci-data.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef LINUX_MMC_SDHCI_PCI_DATA_H
-#define LINUX_MMC_SDHCI_PCI_DATA_H
-
-struct pci_dev;
-
-struct sdhci_pci_data {
-	struct pci_dev	*pdev;
-	int		slotno;
-	int		rst_n_gpio; /* Set to -EINVAL if unused */
-	int		cd_gpio;    /* Set to -EINVAL if unused */
-	int		(*setup)(struct sdhci_pci_data *data);
-	void		(*cleanup)(struct sdhci_pci_data *data);
-};
-
-extern struct sdhci_pci_data *(*sdhci_pci_get_data)(struct pci_dev *pdev,
-				int slotno);
-#endif
-- 
cgit v1.2.3


From cf6a8b1b24d675afc35a01cccd081160014a0125 Mon Sep 17 00:00:00 2001
From: Aharon Landau <aharonl@nvidia.com>
Date: Tue, 12 Oct 2021 13:26:30 +0300
Subject: RDMA/mlx5: Remove iova from struct mlx5_core_mkey

iova is already stored in ibmr->iova, no need to store it here.

Signed-off-by: Aharon Landau <aharonl@nvidia.com>
Reviewed-by: Shay Drory <shayd@nvidia.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 include/linux/mlx5/driver.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index e23417424373..669904f9986e 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -364,7 +364,6 @@ enum {
 };
 
 struct mlx5_core_mkey {
-	u64			iova;
 	u64			size;
 	u32			key;
 	u32			pd;
-- 
cgit v1.2.3


From 062fd731e51ee29ba745b2fd1c7ac87dd460d4ca Mon Sep 17 00:00:00 2001
From: Aharon Landau <aharonl@nvidia.com>
Date: Tue, 12 Oct 2021 13:26:31 +0300
Subject: RDMA/mlx5: Remove size from struct mlx5_core_mkey

mkey->size is already stored in ibmr->length, no need to store it here.

Signed-off-by: Aharon Landau <aharonl@nvidia.com>
Reviewed-by: Shay Drory <shayd@nvidia.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 include/linux/mlx5/driver.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 669904f9986e..ff1e991314e2 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -364,7 +364,6 @@ enum {
 };
 
 struct mlx5_core_mkey {
-	u64			size;
 	u32			key;
 	u32			pd;
 	u32			type;
-- 
cgit v1.2.3


From c64674168b6a2f293e92caf33c917ccf10886801 Mon Sep 17 00:00:00 2001
From: Aharon Landau <aharonl@nvidia.com>
Date: Tue, 12 Oct 2021 13:26:32 +0300
Subject: RDMA/mlx5: Remove pd from struct mlx5_core_mkey

There is no read of mkey->pd, only write. Remove it.

Signed-off-by: Aharon Landau <aharonl@nvidia.com>
Reviewed-by: Shay Drory <shayd@nvidia.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 include/linux/mlx5/driver.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index ff1e991314e2..f0ce7d4dc4ff 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -365,7 +365,6 @@ enum {
 
 struct mlx5_core_mkey {
 	u32			key;
-	u32			pd;
 	u32			type;
 	struct wait_queue_head wait;
 	refcount_t usecount;
-- 
cgit v1.2.3


From 83fec3f12a5904b62330fd1a89af6d892afc387e Mon Sep 17 00:00:00 2001
From: Aharon Landau <aharonl@nvidia.com>
Date: Tue, 12 Oct 2021 13:26:33 +0300
Subject: RDMA/mlx5: Replace struct mlx5_core_mkey by u32 key

In mlx5_core and vdpa there is no use of mlx5_core_mkey members except
for the key itself.

As preparation for moving mlx5_core_mkey to mlx5_ib, the occurrences of
struct mlx5_core_mkey in all modules except for mlx5_ib are replaced by
a u32 key.

Signed-off-by: Aharon Landau <aharonl@nvidia.com>
Reviewed-by: Shay Drory <shayd@nvidia.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 include/linux/mlx5/driver.h | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index f0ce7d4dc4ff..bd99f713720b 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -650,7 +650,7 @@ struct mlx5e_resources {
 	struct mlx5e_hw_objs {
 		u32                        pdn;
 		struct mlx5_td             td;
-		struct mlx5_core_mkey      mkey;
+		u32			   mkey;
 		struct mlx5_sq_bfreg       bfreg;
 	} hw_objs;
 	struct devlink_port dl_port;
@@ -1021,13 +1021,11 @@ struct mlx5_cmd_mailbox *mlx5_alloc_cmd_mailbox_chain(struct mlx5_core_dev *dev,
 						      gfp_t flags, int npages);
 void mlx5_free_cmd_mailbox_chain(struct mlx5_core_dev *dev,
 				 struct mlx5_cmd_mailbox *head);
-int mlx5_core_create_mkey(struct mlx5_core_dev *dev,
-			  struct mlx5_core_mkey *mkey,
-			  u32 *in, int inlen);
-int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev,
-			   struct mlx5_core_mkey *mkey);
-int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *mkey,
-			 u32 *out, int outlen);
+int mlx5_core_create_mkey(struct mlx5_core_dev *dev, u32 *mkey, u32 *in,
+			  int inlen);
+int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, u32 mkey);
+int mlx5_core_query_mkey(struct mlx5_core_dev *dev, u32 mkey, u32 *out,
+			 int outlen);
 int mlx5_core_alloc_pd(struct mlx5_core_dev *dev, u32 *pdn);
 int mlx5_core_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn);
 int mlx5_pagealloc_init(struct mlx5_core_dev *dev);
-- 
cgit v1.2.3


From 4123bfb0b28b77b944360be8c758b1a0974e96ad Mon Sep 17 00:00:00 2001
From: Aharon Landau <aharonl@nvidia.com>
Date: Tue, 12 Oct 2021 13:26:34 +0300
Subject: RDMA/mlx5: Move struct mlx5_core_mkey to mlx5_ib

Move mlx5_core_mkey struct to mlx5_ib, as the mlx5_core doesn't use it
at this point.

Signed-off-by: Aharon Landau <aharonl@nvidia.com>
Reviewed-by: Shay Drory <shayd@nvidia.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 include/linux/mlx5/driver.h | 13 -------------
 1 file changed, 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index bd99f713720b..70b6aff4940a 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -357,19 +357,6 @@ struct mlx5_core_sig_ctx {
 	u32			sigerr_count;
 };
 
-enum {
-	MLX5_MKEY_MR = 1,
-	MLX5_MKEY_MW,
-	MLX5_MKEY_INDIRECT_DEVX,
-};
-
-struct mlx5_core_mkey {
-	u32			key;
-	u32			type;
-	struct wait_queue_head wait;
-	refcount_t usecount;
-};
-
 #define MLX5_24BIT_MASK		((1 << 24) - 1)
 
 enum mlx5_res_type {
-- 
cgit v1.2.3


From e80094a473eefad9d856ce3ab0d7afdbb64800c4 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 18 Oct 2021 14:10:02 -0700
Subject: ethernet: add a helper for assigning port addresses

We have 5 drivers which offset base MAC addr by port id.
Create a helper for them.

This helper takes care of overflows, which some drivers
did not do, please complain if that's going to break
anything!

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Shannon Nelson <snelson@pensando.io>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/etherdevice.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 23681c3d3b8a..2ad71cc90b37 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -551,6 +551,27 @@ static inline unsigned long compare_ether_header(const void *a, const void *b)
 #endif
 }
 
+/**
+ * eth_hw_addr_gen - Generate and assign Ethernet address to a port
+ * @dev: pointer to port's net_device structure
+ * @base_addr: base Ethernet address
+ * @id: offset to add to the base address
+ *
+ * Generate a MAC address using a base address and an offset and assign it
+ * to a net_device. Commonly used by switch drivers which need to compute
+ * addresses for all their ports. addr_assign_type is not changed.
+ */
+static inline void eth_hw_addr_gen(struct net_device *dev, const u8 *base_addr,
+				   unsigned int id)
+{
+	u64 u = ether_addr_to_u64(base_addr);
+	u8 addr[ETH_ALEN];
+
+	u += id;
+	u64_to_ether_addr(u, addr);
+	eth_hw_addr_set(dev, addr);
+}
+
 /**
  * eth_skb_pad - Pad buffer to mininum number of octets for Ethernet frame
  * @skb: Buffer to pad
-- 
cgit v1.2.3


From db9a02baa23267c695a44234a0f2f4607992780e Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 6 Oct 2021 06:15:04 -0600
Subject: block: move bdev_read_only() into the header

This is called for every write in the fast path, move it inline next
to get_disk_ro() which is called internally.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/genhd.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index cd4038fd5743..c70bc5fce4db 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -221,6 +221,11 @@ static inline int get_disk_ro(struct gendisk *disk)
 		test_bit(GD_READ_ONLY, &disk->state);
 }
 
+static inline int bdev_read_only(struct block_device *bdev)
+{
+	return bdev->bd_read_only || get_disk_ro(bdev->bd_disk);
+}
+
 extern void disk_block_events(struct gendisk *disk);
 extern void disk_unblock_events(struct gendisk *disk);
 extern void disk_flush_events(struct gendisk *disk, unsigned int mask);
-- 
cgit v1.2.3


From e028f167eca5fc56938e6c1680d40eed0bc39e80 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 16 Oct 2021 16:38:14 -0600
Subject: block: move blk_mq_tag_to_rq() inline

This is in the fast path of driver issue or completion, and it's a single
array index operation. Move it inline to avoid a function call for it.

This does mean making struct blk_mq_tags block layer public, but there's
not really much in there.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 36 +++++++++++++++++++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 656fe34bdb6c..6cf35de151a9 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -7,6 +7,7 @@
 #include <linux/srcu.h>
 #include <linux/lockdep.h>
 #include <linux/scatterlist.h>
+#include <linux/prefetch.h>
 
 struct blk_mq_tags;
 struct blk_flush_queue;
@@ -675,7 +676,40 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
 struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 		unsigned int op, blk_mq_req_flags_t flags,
 		unsigned int hctx_idx);
-struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);
+
+/*
+ * Tag address space map.
+ */
+struct blk_mq_tags {
+	unsigned int nr_tags;
+	unsigned int nr_reserved_tags;
+
+	atomic_t active_queues;
+
+	struct sbitmap_queue bitmap_tags;
+	struct sbitmap_queue breserved_tags;
+
+	struct request **rqs;
+	struct request **static_rqs;
+	struct list_head page_list;
+
+	/*
+	 * used to clear request reference in rqs[] before freeing one
+	 * request pool
+	 */
+	spinlock_t lock;
+};
+
+static inline struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags,
+					       unsigned int tag)
+{
+	if (tag < tags->nr_tags) {
+		prefetch(tags->rqs[tag]);
+		return tags->rqs[tag];
+	}
+
+	return NULL;
+}
 
 enum {
 	BLK_MQ_UNIQUE_TAG_BITS = 16,
-- 
cgit v1.2.3


From bc490f81731e181b07b8d7577425c06ae91692c8 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 18 Oct 2021 10:12:12 -0600
Subject: block: change plugging to use a singly linked list

Use a singly linked list for the blk_plug. This saves 8 bytes in the
blk_plug struct, and makes for faster list manipulations than doubly
linked lists. As we don't use the doubly linked lists for anything,
singly linked is just fine.

This yields a bump in default (merging enabled) performance from 7.0
to 7.1M IOPS, and ~7.5M IOPS with merging disabled.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index fd9771a1da09..4027112b9851 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -728,7 +728,7 @@ extern void blk_set_queue_dying(struct request_queue *);
  * schedule() where blk_schedule_flush_plug() is called.
  */
 struct blk_plug {
-	struct list_head mq_list; /* blk-mq requests */
+	struct request *mq_list; /* blk-mq requests */
 
 	/* if ios_left is > 1, we can batch tag/rq allocations */
 	struct request *cached_rq;
@@ -777,8 +777,7 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk)
 	struct blk_plug *plug = tsk->plug;
 
 	return plug &&
-		 (!list_empty(&plug->mq_list) ||
-		 !list_empty(&plug->cb_list));
+		 (plug->mq_list || !list_empty(&plug->cb_list));
 }
 
 int blkdev_issue_flush(struct block_device *bdev);
-- 
cgit v1.2.3


From dc5fc361d891e089dfd9c0a975dc78041036b906 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 19 Oct 2021 06:02:30 -0600
Subject: block: attempt direct issue of plug list

If we have just one queue type in the plug list, then we can extend our
direct issue to cover a full plug list as well. This allows sending a
batch of requests for direct issue, which is more efficient than doing
one-at-a-time kind of issue.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4027112b9851..f13091d3d476 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -737,6 +737,7 @@ struct blk_plug {
 	unsigned short rq_count;
 
 	bool multiple_queues;
+	bool has_elevator;
 	bool nowait;
 
 	struct list_head cb_list; /* md requires an unplug callback */
-- 
cgit v1.2.3


From cd45c9bf8b43cd387e167cf166ae5c517f56d658 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Mon, 18 Oct 2021 16:33:22 +0200
Subject: ASoC: Intel: Move soc_intel_is_foo() helpers to a generic header

The soc_intel_is_foo() helpers from
sound/soc/intel/common/soc-intel-quirks.h are useful outside of the
sound subsystem too.

Move these to include/linux/platform_data/x86/soc.h, so that
other code can use them too.

Suggested-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Acked-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Link: https://lore.kernel.org/r/20211018143324.296961-2-hdegoede@redhat.com
---
 include/linux/platform_data/x86/soc.h | 65 +++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)
 create mode 100644 include/linux/platform_data/x86/soc.h

(limited to 'include/linux')

diff --git a/include/linux/platform_data/x86/soc.h b/include/linux/platform_data/x86/soc.h
new file mode 100644
index 000000000000..da05f425587a
--- /dev/null
+++ b/include/linux/platform_data/x86/soc.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Helpers for Intel SoC model detection
+ *
+ * Copyright (c) 2019, Intel Corporation.
+ */
+
+#ifndef __PLATFORM_DATA_X86_SOC_H
+#define __PLATFORM_DATA_X86_SOC_H
+
+#if IS_ENABLED(CONFIG_X86)
+
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+
+#define SOC_INTEL_IS_CPU(soc, type)				\
+static inline bool soc_intel_is_##soc(void)			\
+{								\
+	static const struct x86_cpu_id soc##_cpu_ids[] = {	\
+		X86_MATCH_INTEL_FAM6_MODEL(type, NULL),		\
+		{}						\
+	};							\
+	const struct x86_cpu_id *id;				\
+								\
+	id = x86_match_cpu(soc##_cpu_ids);			\
+	if (id)							\
+		return true;					\
+	return false;						\
+}
+
+SOC_INTEL_IS_CPU(byt, ATOM_SILVERMONT);
+SOC_INTEL_IS_CPU(cht, ATOM_AIRMONT);
+SOC_INTEL_IS_CPU(apl, ATOM_GOLDMONT);
+SOC_INTEL_IS_CPU(glk, ATOM_GOLDMONT_PLUS);
+SOC_INTEL_IS_CPU(cml, KABYLAKE_L);
+
+#else /* IS_ENABLED(CONFIG_X86) */
+
+static inline bool soc_intel_is_byt(void)
+{
+	return false;
+}
+
+static inline bool soc_intel_is_cht(void)
+{
+	return false;
+}
+
+static inline bool soc_intel_is_apl(void)
+{
+	return false;
+}
+
+static inline bool soc_intel_is_glk(void)
+{
+	return false;
+}
+
+static inline bool soc_intel_is_cml(void)
+{
+	return false;
+}
+#endif /* IS_ENABLED(CONFIG_X86) */
+
+#endif /* __PLATFORM_DATA_X86_SOC_H */
-- 
cgit v1.2.3


From aaa2975f2b07b04ee16b2cad1072cbdea3e1c50a Mon Sep 17 00:00:00 2001
From: Lasse Collin <lasse.collin@tukaani.org>
Date: Mon, 11 Oct 2021 05:31:42 +0800
Subject: lib/xz: Add MicroLZMA decoder

MicroLZMA is a yet another header format variant where the first
byte of a raw LZMA stream (without the end of stream marker) has
been replaced with a bitwise-negation of the lc/lp/pb properties
byte. MicroLZMA was created to be used in EROFS but can be used
by other things too where wasting minimal amount of space for
headers is important.

This is implemented using most of the LZMA2 code as is so the
amount of new code is small. The API has a few extra features
compared to the XZ decoder. On the other hand, the API lacks
XZ_BUF_ERROR support which is important to take into account
when using this API.

MicroLZMA doesn't support BCJ filters. In theory they could be
added later as there are many unused/reserved values for the
first byte of the compressed stream but in practice it is
somewhat unlikely to happen due to a few implementation reasons.

Link: https://lore.kernel.org/r/20211010213145.17462-5-xiang@kernel.org
Signed-off-by: Lasse Collin <lasse.collin@tukaani.org>
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
---
 include/linux/xz.h | 106 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/xz.h b/include/linux/xz.h
index 9884c8440188..7285ca5d56e9 100644
--- a/include/linux/xz.h
+++ b/include/linux/xz.h
@@ -233,6 +233,112 @@ XZ_EXTERN void xz_dec_reset(struct xz_dec *s);
  */
 XZ_EXTERN void xz_dec_end(struct xz_dec *s);
 
+/*
+ * Decompressor for MicroLZMA, an LZMA variant with a very minimal header.
+ * See xz_dec_microlzma_alloc() below for details.
+ *
+ * These functions aren't used or available in preboot code and thus aren't
+ * marked with XZ_EXTERN. This avoids warnings about static functions that
+ * are never defined.
+ */
+/**
+ * struct xz_dec_microlzma - Opaque type to hold the MicroLZMA decoder state
+ */
+struct xz_dec_microlzma;
+
+/**
+ * xz_dec_microlzma_alloc() - Allocate memory for the MicroLZMA decoder
+ * @mode        XZ_SINGLE or XZ_PREALLOC
+ * @dict_size   LZMA dictionary size. This must be at least 4 KiB and
+ *              at most 3 GiB.
+ *
+ * In contrast to xz_dec_init(), this function only allocates the memory
+ * and remembers the dictionary size. xz_dec_microlzma_reset() must be used
+ * before calling xz_dec_microlzma_run().
+ *
+ * The amount of allocated memory is a little less than 30 KiB with XZ_SINGLE.
+ * With XZ_PREALLOC also a dictionary buffer of dict_size bytes is allocated.
+ *
+ * On success, xz_dec_microlzma_alloc() returns a pointer to
+ * struct xz_dec_microlzma. If memory allocation fails or
+ * dict_size is invalid, NULL is returned.
+ *
+ * The compressed format supported by this decoder is a raw LZMA stream
+ * whose first byte (always 0x00) has been replaced with bitwise-negation
+ * of the LZMA properties (lc/lp/pb) byte. For example, if lc/lp/pb is
+ * 3/0/2, the first byte is 0xA2. This way the first byte can never be 0x00.
+ * Just like with LZMA2, lc + lp <= 4 must be true. The LZMA end-of-stream
+ * marker must not be used. The unused values are reserved for future use.
+ * This MicroLZMA header format was created for use in EROFS but may be used
+ * by others too.
+ */
+extern struct xz_dec_microlzma *xz_dec_microlzma_alloc(enum xz_mode mode,
+						       uint32_t dict_size);
+
+/**
+ * xz_dec_microlzma_reset() - Reset the MicroLZMA decoder state
+ * @s           Decoder state allocated using xz_dec_microlzma_alloc()
+ * @comp_size   Compressed size of the input stream
+ * @uncomp_size Uncompressed size of the input stream. A value smaller
+ *              than the real uncompressed size of the input stream can
+ *              be specified if uncomp_size_is_exact is set to false.
+ *              uncomp_size can never be set to a value larger than the
+ *              expected real uncompressed size because it would eventually
+ *              result in XZ_DATA_ERROR.
+ * @uncomp_size_is_exact  This is an int instead of bool to avoid
+ *              requiring stdbool.h. This should normally be set to true.
+ *              When this is set to false, error detection is weaker.
+ */
+extern void xz_dec_microlzma_reset(struct xz_dec_microlzma *s,
+				   uint32_t comp_size, uint32_t uncomp_size,
+				   int uncomp_size_is_exact);
+
+/**
+ * xz_dec_microlzma_run() - Run the MicroLZMA decoder
+ * @s           Decoder state initialized using xz_dec_microlzma_reset()
+ * @b:          Input and output buffers
+ *
+ * This works similarly to xz_dec_run() with a few important differences.
+ * Only the differences are documented here.
+ *
+ * The only possible return values are XZ_OK, XZ_STREAM_END, and
+ * XZ_DATA_ERROR. This function cannot return XZ_BUF_ERROR: if no progress
+ * is possible due to lack of input data or output space, this function will
+ * keep returning XZ_OK. Thus, the calling code must be written so that it
+ * will eventually provide input and output space matching (or exceeding)
+ * comp_size and uncomp_size arguments given to xz_dec_microlzma_reset().
+ * If the caller cannot do this (for example, if the input file is truncated
+ * or otherwise corrupt), the caller must detect this error by itself to
+ * avoid an infinite loop.
+ *
+ * If the compressed data seems to be corrupt, XZ_DATA_ERROR is returned.
+ * This can happen also when incorrect dictionary, uncompressed, or
+ * compressed sizes have been specified.
+ *
+ * With XZ_PREALLOC only: As an extra feature, b->out may be NULL to skip over
+ * uncompressed data. This way the caller doesn't need to provide a temporary
+ * output buffer for the bytes that will be ignored.
+ *
+ * With XZ_SINGLE only: In contrast to xz_dec_run(), the return value XZ_OK
+ * is also possible and thus XZ_SINGLE is actually a limited multi-call mode.
+ * After XZ_OK the bytes decoded so far may be read from the output buffer.
+ * It is possible to continue decoding but the variables b->out and b->out_pos
+ * MUST NOT be changed by the caller. Increasing the value of b->out_size is
+ * allowed to make more output space available; one doesn't need to provide
+ * space for the whole uncompressed data on the first call. The input buffer
+ * may be changed normally like with XZ_PREALLOC. This way input data can be
+ * provided from non-contiguous memory.
+ */
+extern enum xz_ret xz_dec_microlzma_run(struct xz_dec_microlzma *s,
+					struct xz_buf *b);
+
+/**
+ * xz_dec_microlzma_end() - Free the memory allocated for the decoder state
+ * @s:          Decoder state allocated using xz_dec_microlzma_alloc().
+ *              If s is NULL, this function does nothing.
+ */
+extern void xz_dec_microlzma_end(struct xz_dec_microlzma *s);
+
 /*
  * Standalone build (userspace build or in-kernel build for boot time use)
  * needs a CRC32 implementation. For normal in-kernel use, kernel's own
-- 
cgit v1.2.3


From e70feb8b3e6886c525c88943b5f1508d02f5a683 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Thu, 14 Oct 2021 16:17:10 +0800
Subject: blk-mq: support concurrent queue quiesce/unquiesce

blk_mq_quiesce_queue() has been used a bit wide now, so far we don't support
concurrent/nested quiesce. One biggest issue is that unquiesce can happen
unexpectedly in case that quiesce/unquiesce are run concurrently from
more than one context.

This patch introduces q->mq_quiesce_depth to deal concurrent quiesce,
and we only unquiesce queue when it is the last/outer-most one of all
contexts.

Several kernel panic issue has been reported[1][2][3] when running stress
quiesce test. And this patch has been verified in these reports.

[1] https://lore.kernel.org/linux-block/9b21c797-e505-3821-4f5b-df7bf9380328@huawei.com/T/#m1fc52431fad7f33b1ffc3f12c4450e4238540787
[2] https://lore.kernel.org/linux-block/9b21c797-e505-3821-4f5b-df7bf9380328@huawei.com/T/#m10ad90afeb9c8cc318334190a7c24c8b5c5e0722
[3] https://listman.redhat.com/archives/dm-devel/2021-September/msg00189.html

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20211014081710.1871747-7-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f13091d3d476..2b22fa36e568 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -315,6 +315,8 @@ struct request_queue {
 	 */
 	struct mutex		mq_freeze_lock;
 
+	int			quiesce_depth;
+
 	struct blk_mq_tag_set	*tag_set;
 	struct list_head	tag_set_list;
 	struct bio_set		bio_split;
-- 
cgit v1.2.3


From 34cdd18b8d245f3e901e5325313c27de727ab80d Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Wed, 17 Jun 2020 16:56:16 -0400
Subject: tracing: Use linker magic instead of recasting ftrace_ops_list_func()

In an effort to enable -Wcast-function-type in the top-level Makefile to
support Control Flow Integrity builds, all function casts need to be
removed.

This means that ftrace_ops_list_func() can no longer be defined as
ftrace_ops_no_ops(). The reason for ftrace_ops_no_ops() is to use that when
an architecture calls ftrace_ops_list_func() with only two parameters
(called from assembly). And to make sure there's no C side-effects, those
archs call ftrace_ops_no_ops() which only has two parameters, as
ftrace_ops_list_func() has four parameters.

Instead of a typecast, use vmlinux.lds.h to define ftrace_ops_list_func() to
arch_ftrace_ops_list_func() that will define the proper set of parameters.

Link: https://lore.kernel.org/r/20200614070154.6039-1-oscar.carter@gmx.com
Link: https://lkml.kernel.org/r/20200617165616.52241bde@oasis.local.home
Link: https://lore.kernel.org/all/20211005053922.GA702049@embeddedor/

Requested-by: Oscar Carter <oscar.carter@gmx.com>
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/ftrace.h | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 832e65f06754..12fcfa2d23ea 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -30,16 +30,26 @@
 #define ARCH_SUPPORTS_FTRACE_OPS 0
 #endif
 
+#ifdef CONFIG_FUNCTION_TRACER
+struct ftrace_ops;
+struct ftrace_regs;
 /*
  * If the arch's mcount caller does not support all of ftrace's
  * features, then it must call an indirect function that
  * does. Or at least does enough to prevent any unwelcome side effects.
+ *
+ * Also define the function prototype that these architectures use
+ * to call the ftrace_ops_list_func().
  */
 #if !ARCH_SUPPORTS_FTRACE_OPS
 # define FTRACE_FORCE_LIST_FUNC 1
+void arch_ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip);
 #else
 # define FTRACE_FORCE_LIST_FUNC 0
+void arch_ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
+			       struct ftrace_ops *op, struct ftrace_regs *fregs);
 #endif
+#endif /* CONFIG_FUNCTION_TRACER */
 
 /* Main tracing buffer and events set up */
 #ifdef CONFIG_TRACING
@@ -88,8 +98,6 @@ extern int
 ftrace_enable_sysctl(struct ctl_table *table, int write,
 		     void *buffer, size_t *lenp, loff_t *ppos);
 
-struct ftrace_ops;
-
 #ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
 
 struct ftrace_regs {
-- 
cgit v1.2.3


From 9b84fadc444de5456ab5f5487e2108311c724c3f Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Fri, 15 Oct 2021 13:42:40 -0400
Subject: tracing: Reuse logic from perf's get_recursion_context()

Instead of having branches that adds noise to the branch prediction, use
the addition logic to set the bit for the level of interrupt context that
the state is currently in. This copies the logic from perf's
get_recursion_context() function.

Link: https://lore.kernel.org/all/20211015161702.GF174703@worktop.programming.kicks-ass.net/

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/trace_recursion.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h
index a9f9c5714e65..f6da7a03bff0 100644
--- a/include/linux/trace_recursion.h
+++ b/include/linux/trace_recursion.h
@@ -137,12 +137,13 @@ enum {
 static __always_inline int trace_get_context_bit(void)
 {
 	unsigned long pc = preempt_count();
+	unsigned char bit = 0;
 
-	if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
-		return TRACE_CTX_NORMAL;
-	else
-		return pc & NMI_MASK ? TRACE_CTX_NMI :
-			pc & HARDIRQ_MASK ? TRACE_CTX_IRQ : TRACE_CTX_SOFTIRQ;
+	bit += !!(pc & (NMI_MASK));
+	bit += !!(pc & (NMI_MASK | HARDIRQ_MASK));
+	bit += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));
+
+	return TRACE_CTX_NORMAL - bit;
 }
 
 #ifdef CONFIG_FTRACE_RECORD_RECURSION
-- 
cgit v1.2.3


From 91ebe8bcbff9d2ff21303e73bf7434f39a98b255 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Fri, 15 Oct 2021 15:01:19 -0400
Subject: tracing/perf: Add interrupt_context_level() helper

Now that there are three different instances of doing the addition trick
to the preempt_count() and NMI_MASK, HARDIRQ_MASK and SOFTIRQ_OFFSET
macros, it deserves a helper function defined in the preempt.h header.

Add the interrupt_context_level() helper and replace the three instances
that do that logic with it.

Link: https://lore.kernel.org/all/20211015142541.4badd8a9@gandalf.local.home/

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/preempt.h         | 21 +++++++++++++++++++++
 include/linux/trace_recursion.h |  7 +------
 2 files changed, 22 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 4d244e295e85..b32e3dabe28b 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -77,6 +77,27 @@
 /* preempt_count() and related functions, depends on PREEMPT_NEED_RESCHED */
 #include <asm/preempt.h>
 
+/**
+ * interrupt_context_level - return interrupt context level
+ *
+ * Returns the current interrupt context level.
+ *  0 - normal context
+ *  1 - softirq context
+ *  2 - hardirq context
+ *  3 - NMI context
+ */
+static __always_inline unsigned char interrupt_context_level(void)
+{
+	unsigned long pc = preempt_count();
+	unsigned char level = 0;
+
+	level += !!(pc & (NMI_MASK));
+	level += !!(pc & (NMI_MASK | HARDIRQ_MASK));
+	level += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));
+
+	return level;
+}
+
 #define nmi_count()	(preempt_count() & NMI_MASK)
 #define hardirq_count()	(preempt_count() & HARDIRQ_MASK)
 #ifdef CONFIG_PREEMPT_RT
diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h
index f6da7a03bff0..1d8cce02c3fb 100644
--- a/include/linux/trace_recursion.h
+++ b/include/linux/trace_recursion.h
@@ -136,12 +136,7 @@ enum {
 
 static __always_inline int trace_get_context_bit(void)
 {
-	unsigned long pc = preempt_count();
-	unsigned char bit = 0;
-
-	bit += !!(pc & (NMI_MASK));
-	bit += !!(pc & (NMI_MASK | HARDIRQ_MASK));
-	bit += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));
+	unsigned char bit = interrupt_context_level();
 
 	return TRACE_CTX_NORMAL - bit;
 }
-- 
cgit v1.2.3


From 15bf32398ad488c0df1cbaf16431422c87e4feea Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Tue, 12 Oct 2021 09:23:07 -0400
Subject: security: Return xattr name from security_dentry_init_security()

Right now security_dentry_init_security() only supports single security
label and is used by SELinux only. There are two users of this hook,
namely ceph and nfs.

NFS does not care about xattr name. Ceph hardcodes the xattr name to
security.selinux (XATTR_NAME_SELINUX).

I am making changes to fuse/virtiofs to send security label to virtiofsd
and I need to send xattr name as well. I also hardcoded the name of
xattr to security.selinux.

Stephen Smalley suggested that it probably is a good idea to modify
security_dentry_init_security() to also return name of xattr so that
we can avoid this hardcoding in the callers.

This patch adds a new parameter "const char **xattr_name" to
security_dentry_init_security() and LSM puts the name of xattr
too if caller asked for it (xattr_name != NULL).

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Christian Brauner <christian.brauner@ubuntu.com>
Acked-by: James Morris <jamorris@linux.microsoft.com>
[PM: fixed typos in the commit description]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h | 3 ++-
 include/linux/lsm_hooks.h     | 3 +++
 include/linux/security.h      | 6 ++++--
 3 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 4c7ed0268ce3..a9ac70ae01ab 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -83,7 +83,8 @@ LSM_HOOK(int, 0, sb_add_mnt_opt, const char *option, const char *val,
 LSM_HOOK(int, 0, move_mount, const struct path *from_path,
 	 const struct path *to_path)
 LSM_HOOK(int, 0, dentry_init_security, struct dentry *dentry,
-	 int mode, const struct qstr *name, void **ctx, u32 *ctxlen)
+	 int mode, const struct qstr *name, const char **xattr_name,
+	 void **ctx, u32 *ctxlen)
 LSM_HOOK(int, 0, dentry_create_files_as, struct dentry *dentry, int mode,
 	 struct qstr *name, const struct cred *old, struct cred *new)
 
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 528554e9b90c..0bada4df23fc 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -196,6 +196,9 @@
  *	@dentry dentry to use in calculating the context.
  *	@mode mode used to determine resource type.
  *	@name name of the last path component used to create file
+ *	@xattr_name pointer to place the pointer to security xattr name.
+ *		    Caller does not have to free the resulting pointer. Its
+ *		    a pointer to static string.
  *	@ctx pointer to place the pointer to the resulting context in.
  *	@ctxlen point to place the length of the resulting context.
  * @dentry_create_files_as:
diff --git a/include/linux/security.h b/include/linux/security.h
index cc6d39358336..7e0ba63b5dde 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -317,8 +317,9 @@ int security_add_mnt_opt(const char *option, const char *val,
 				int len, void **mnt_opts);
 int security_move_mount(const struct path *from_path, const struct path *to_path);
 int security_dentry_init_security(struct dentry *dentry, int mode,
-					const struct qstr *name, void **ctx,
-					u32 *ctxlen);
+				  const struct qstr *name,
+				  const char **xattr_name, void **ctx,
+				  u32 *ctxlen);
 int security_dentry_create_files_as(struct dentry *dentry, int mode,
 					struct qstr *name,
 					const struct cred *old,
@@ -739,6 +740,7 @@ static inline void security_inode_free(struct inode *inode)
 static inline int security_dentry_init_security(struct dentry *dentry,
 						 int mode,
 						 const struct qstr *name,
+						 const char **xattr_name,
 						 void **ctx,
 						 u32 *ctxlen)
 {
-- 
cgit v1.2.3


From cf6d6238cdd319eca404756dee05bf55a748b6a9 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 19 Oct 2021 22:24:10 +0100
Subject: block: turn macro helpers into inline functions

Replace bio_set_dev() with an identical inline helper and move it
further to fix a dependency problem with bio_associate_blkg(). Do the
same for bio_copy_dev().

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bio.h | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 9538f20ffaa5..b12453d7b8a8 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -430,22 +430,6 @@ void zero_fill_bio(struct bio *bio);
 
 extern const char *bio_devname(struct bio *bio, char *buffer);
 
-#define bio_set_dev(bio, bdev) 				\
-do {							\
-	bio_clear_flag(bio, BIO_REMAPPED);		\
-	if ((bio)->bi_bdev != (bdev))			\
-		bio_clear_flag(bio, BIO_THROTTLED);	\
-	(bio)->bi_bdev = (bdev);			\
-	bio_associate_blkg(bio);			\
-} while (0)
-
-#define bio_copy_dev(dst, src)			\
-do {						\
-	bio_clear_flag(dst, BIO_REMAPPED);		\
-	(dst)->bi_bdev = (src)->bi_bdev;	\
-	bio_clone_blkg_association(dst, src);	\
-} while (0)
-
 #define bio_dev(bio) \
 	disk_devt((bio)->bi_bdev->bd_disk)
 
@@ -463,6 +447,22 @@ static inline void bio_clone_blkg_association(struct bio *dst,
 					      struct bio *src) { }
 #endif	/* CONFIG_BLK_CGROUP */
 
+static inline void bio_set_dev(struct bio *bio, struct block_device *bdev)
+{
+	bio_clear_flag(bio, BIO_REMAPPED);
+	if (bio->bi_bdev != bdev)
+		bio_clear_flag(bio, BIO_THROTTLED);
+	bio->bi_bdev = bdev;
+	bio_associate_blkg(bio);
+}
+
+static inline void bio_copy_dev(struct bio *dst, struct bio *src)
+{
+	bio_clear_flag(dst, BIO_REMAPPED);
+	dst->bi_bdev = src->bi_bdev;
+	bio_clone_blkg_association(dst, src);
+}
+
 /*
  * BIO list management for use by remapping drivers (e.g. DM or MD) and loop.
  *
-- 
cgit v1.2.3


From c809084ab033a8d4ee404e2ac3c5d3dc80cb65f7 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 19 Oct 2021 22:24:14 +0100
Subject: block: inline a part of bio_release_pages()

Inline BIO_NO_PAGE_REF check of bio_release_pages() to avoid function
call.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bio.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index b12453d7b8a8..c88700d1bdc3 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -417,7 +417,7 @@ int bio_add_zone_append_page(struct bio *bio, struct page *page,
 void __bio_add_page(struct bio *bio, struct page *page,
 		unsigned int len, unsigned int off);
 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter);
-void bio_release_pages(struct bio *bio, bool mark_dirty);
+void __bio_release_pages(struct bio *bio, bool mark_dirty);
 extern void bio_set_pages_dirty(struct bio *bio);
 extern void bio_check_pages_dirty(struct bio *bio);
 
@@ -428,6 +428,12 @@ extern void bio_free_pages(struct bio *bio);
 void guard_bio_eod(struct bio *bio);
 void zero_fill_bio(struct bio *bio);
 
+static inline void bio_release_pages(struct bio *bio, bool mark_dirty)
+{
+	if (!bio_flagged(bio, BIO_NO_PAGE_REF))
+		__bio_release_pages(bio, mark_dirty);
+}
+
 extern const char *bio_devname(struct bio *bio, char *buffer);
 
 #define bio_dev(bio) \
-- 
cgit v1.2.3


From dbb6f764a079d1dea883c6f2439d91db4f0fb2f2 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 20 Oct 2021 16:41:17 +0200
Subject: blk-mq: move blk_mq_flush_plug_list to block/blk-mq.h

This helper is internal to the block layer.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20211020144119.142582-3-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 6cf35de151a9..e13780236550 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -656,8 +656,6 @@ int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set,
 		unsigned int set_flags);
 void blk_mq_free_tag_set(struct blk_mq_tag_set *set);
 
-void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
-
 void blk_mq_free_request(struct request *rq);
 
 bool blk_mq_queue_inflight(struct request_queue *q);
-- 
cgit v1.2.3


From 008f75a20e7072d0840ec323c39b42206f3fa8a0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 20 Oct 2021 16:41:19 +0200
Subject: block: cleanup the flush plug helpers

Consolidate the various helpers into a single blk_flush_plug helper that
takes a plk_plug and the from_scheduler bool and switch all callsites to
call it directly.  Checks that the plug is non-NULL must be performed by
the caller, something that most already do anyway.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20211020144119.142582-5-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 29 ++++-------------------------
 1 file changed, 4 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2b22fa36e568..c7b1e9355123 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -725,9 +725,8 @@ extern void blk_set_queue_dying(struct request_queue *);
  * as the lock contention for request_queue lock is reduced.
  *
  * It is ok not to disable preemption when adding the request to the plug list
- * or when attempting a merge, because blk_schedule_flush_list() will only flush
- * the plug list when the task sleeps by itself. For details, please see
- * schedule() where blk_schedule_flush_plug() is called.
+ * or when attempting a merge. For details, please see schedule() where
+ * blk_flush_plug() is called.
  */
 struct blk_plug {
 	struct request *mq_list; /* blk-mq requests */
@@ -757,23 +756,8 @@ extern struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug,
 extern void blk_start_plug(struct blk_plug *);
 extern void blk_start_plug_nr_ios(struct blk_plug *, unsigned short);
 extern void blk_finish_plug(struct blk_plug *);
-extern void blk_flush_plug_list(struct blk_plug *, bool);
 
-static inline void blk_flush_plug(struct task_struct *tsk)
-{
-	struct blk_plug *plug = tsk->plug;
-
-	if (plug)
-		blk_flush_plug_list(plug, false);
-}
-
-static inline void blk_schedule_flush_plug(struct task_struct *tsk)
-{
-	struct blk_plug *plug = tsk->plug;
-
-	if (plug)
-		blk_flush_plug_list(plug, true);
-}
+void blk_flush_plug(struct blk_plug *plug, bool from_schedule);
 
 static inline bool blk_needs_flush_plug(struct task_struct *tsk)
 {
@@ -802,15 +786,10 @@ static inline void blk_finish_plug(struct blk_plug *plug)
 {
 }
 
-static inline void blk_flush_plug(struct task_struct *task)
-{
-}
-
-static inline void blk_schedule_flush_plug(struct task_struct *task)
+static inline void blk_flush_plug(struct blk_plug *plug, bool async)
 {
 }
 
-
 static inline bool blk_needs_flush_plug(struct task_struct *tsk)
 {
 	return false;
-- 
cgit v1.2.3


From 55df0933be74bd2e52aba0b67eb743ae0feabe7e Mon Sep 17 00:00:00 2001
From: Imran Khan <imran.f.khan@oracle.com>
Date: Wed, 20 Oct 2021 14:09:00 +1100
Subject: workqueue: Introduce show_one_worker_pool and show_one_workqueue.

Currently show_workqueue_state shows the state of all workqueues and of
all worker pools. In certain cases we may need to dump state of only a
specific workqueue or worker pool. For example in destroy_workqueue we
only need to show state of the workqueue which is getting destroyed.

So rename show_workqueue_state to show_all_workqueues(to signify it
dumps state of all busy workqueues) and divide it into more granular
functions (show_one_workqueue and show_one_worker_pool), that would show
states of individual workqueues and worker pools and can be used in
cases such as the one mentioned above.

Also, as mentioned earlier, make destroy_workqueue dump data pertaining
to only the workqueue that is being destroyed and make user(s) of
earlier interface(show_workqueue_state), use new interface
(show_all_workqueues).

Signed-off-by: Imran Khan <imran.f.khan@oracle.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 74d3c1efd9bb..7fee9b6cfede 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -469,7 +469,8 @@ extern bool workqueue_congested(int cpu, struct workqueue_struct *wq);
 extern unsigned int work_busy(struct work_struct *work);
 extern __printf(1, 2) void set_worker_desc(const char *fmt, ...);
 extern void print_worker_info(const char *log_lvl, struct task_struct *task);
-extern void show_workqueue_state(void);
+extern void show_all_workqueues(void);
+extern void show_one_workqueue(struct workqueue_struct *wq);
 extern void wq_worker_comm(char *buf, size_t size, struct task_struct *task);
 
 /**
-- 
cgit v1.2.3


From f783484381ad088490a497abb4faab47511774c5 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Fri, 15 Oct 2021 10:14:35 +0200
Subject: mfd: ti_am335x_tscadc: Use driver data

So far every sub-cell parameter in this driver was hardcoded: cell name,
cell compatible, specific clock name and desired clock frequency.

As we are about to introduce support for ADC1/magnetic reader, we need a
bit of flexibility. Let's add a driver data structure which will contain
these information.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20211015081506.933180-18-miquel.raynal@bootlin.com
---
 include/linux/mfd/ti_am335x_tscadc.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/ti_am335x_tscadc.h b/include/linux/mfd/ti_am335x_tscadc.h
index ffc091b77633..bb3b56ade3fb 100644
--- a/include/linux/mfd/ti_am335x_tscadc.h
+++ b/include/linux/mfd/ti_am335x_tscadc.h
@@ -162,11 +162,20 @@
 
 #define TSCADC_CELLS		2
 
+struct ti_tscadc_data {
+	char *adc_feature_name;
+	char *adc_feature_compatible;
+	char *secondary_feature_name;
+	char *secondary_feature_compatible;
+	unsigned int target_clk_rate;
+};
+
 struct ti_tscadc_dev {
 	struct device *dev;
 	struct regmap *regmap;
 	void __iomem *tscadc_base;
 	phys_addr_t tscadc_phys_base;
+	const struct ti_tscadc_data *data;
 	int irq;
 	int used_cells;	/* 1-2 */
 	int tsc_wires;
-- 
cgit v1.2.3


From 7c605802f33176d872ffb6a3830ba4d88d9f21f6 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Fri, 15 Oct 2021 10:14:37 +0200
Subject: mfd: ti_am335x_tscadc: Drop useless variables from the driver
 structure

Keeping the count of tsc_cells and adc_cells is completely redundant, we
can derive this information from other variables. Plus, these variables
are not used anywhere else now. Let's get rid of them.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20211015081506.933180-20-miquel.raynal@bootlin.com
---
 include/linux/mfd/ti_am335x_tscadc.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/ti_am335x_tscadc.h b/include/linux/mfd/ti_am335x_tscadc.h
index bb3b56ade3fb..23442059d271 100644
--- a/include/linux/mfd/ti_am335x_tscadc.h
+++ b/include/linux/mfd/ti_am335x_tscadc.h
@@ -177,10 +177,7 @@ struct ti_tscadc_dev {
 	phys_addr_t tscadc_phys_base;
 	const struct ti_tscadc_data *data;
 	int irq;
-	int used_cells;	/* 1-2 */
 	int tsc_wires;
-	int tsc_cell;	/* -1 if not used */
-	int adc_cell;	/* -1 if not used */
 	struct mfd_cell cells[TSCADC_CELLS];
 	u32 reg_se_cache;
 	bool adc_waiting;
-- 
cgit v1.2.3


From b813f32030e2b70adf68f73e99853f91526aa3a1 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Fri, 15 Oct 2021 10:14:40 +0200
Subject: mfd: ti_am335x_tscadc: Gather the ctrl register logic in one place

Instead of deriving in the probe and in the resume path the value of the
ctrl register, let's do it only once in the probe, save the value of
this register (all but the subsystem enable bit) in the driver's
structure and use it from the resume callback.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20211015081506.933180-23-miquel.raynal@bootlin.com
---
 include/linux/mfd/ti_am335x_tscadc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/ti_am335x_tscadc.h b/include/linux/mfd/ti_am335x_tscadc.h
index 23442059d271..c9c6f0b29181 100644
--- a/include/linux/mfd/ti_am335x_tscadc.h
+++ b/include/linux/mfd/ti_am335x_tscadc.h
@@ -177,8 +177,8 @@ struct ti_tscadc_dev {
 	phys_addr_t tscadc_phys_base;
 	const struct ti_tscadc_data *data;
 	int irq;
-	int tsc_wires;
 	struct mfd_cell cells[TSCADC_CELLS];
+	u32 ctrl;
 	u32 reg_se_cache;
 	bool adc_waiting;
 	bool adc_in_use;
-- 
cgit v1.2.3


From 36782dab984a8b4ef8f4ec8c2270c56468cbbbeb Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Fri, 15 Oct 2021 10:14:41 +0200
Subject: mfd: ti_am335x_tscadc: Replace the header license text with SPDX tag

Drop the text license and replace it with an equivalent SPDX license tag
identifier.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Acked-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20211015081506.933180-24-miquel.raynal@bootlin.com
---
 include/linux/mfd/ti_am335x_tscadc.h | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/ti_am335x_tscadc.h b/include/linux/mfd/ti_am335x_tscadc.h
index c9c6f0b29181..2b8a5f58a3b6 100644
--- a/include/linux/mfd/ti_am335x_tscadc.h
+++ b/include/linux/mfd/ti_am335x_tscadc.h
@@ -1,21 +1,13 @@
-#ifndef __LINUX_TI_AM335X_TSCADC_MFD_H
-#define __LINUX_TI_AM335X_TSCADC_MFD_H
-
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * TI Touch Screen / ADC MFD driver
  *
  * Copyright (C) 2012 Texas Instruments Incorporated - https://www.ti.com/
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
+#ifndef __LINUX_TI_AM335X_TSCADC_MFD_H
+#define __LINUX_TI_AM335X_TSCADC_MFD_H
+
 #include <linux/mfd/core.h>
 
 #define REG_RAWIRQSTATUS	0x024
-- 
cgit v1.2.3


From 3831abe135566813341cc36b1493d75f6ac460c3 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Fri, 15 Oct 2021 10:14:42 +0200
Subject: mfd: ti_am335x_tscadc: Fix header spacing

Harmonize the spacing within macro definitions.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20211015081506.933180-25-miquel.raynal@bootlin.com
---
 include/linux/mfd/ti_am335x_tscadc.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/ti_am335x_tscadc.h b/include/linux/mfd/ti_am335x_tscadc.h
index 2b8a5f58a3b6..893c474c1f8c 100644
--- a/include/linux/mfd/ti_am335x_tscadc.h
+++ b/include/linux/mfd/ti_am335x_tscadc.h
@@ -41,7 +41,7 @@
 /* Step Enable */
 #define STEPENB_MASK		(0x1FFFF << 0)
 #define STEPENB(val)		((val) << 0)
-#define ENB(val)			(1 << (val))
+#define ENB(val)		(1 << (val))
 #define STPENB_STEPENB		STEPENB(0x1FFFF)
 #define STPENB_STEPENB_TC	STEPENB(0x1FFF)
 
@@ -122,15 +122,15 @@
 #define CNTRLREG_TSCENB		BIT(7)
 
 /* FIFO READ Register */
-#define FIFOREAD_DATA_MASK (0xfff << 0)
-#define FIFOREAD_CHNLID_MASK (0xf << 16)
+#define FIFOREAD_DATA_MASK	(0xfff << 0)
+#define FIFOREAD_CHNLID_MASK	(0xf << 16)
 
 /* DMA ENABLE/CLEAR Register */
 #define DMA_FIFO0		BIT(0)
 #define DMA_FIFO1		BIT(1)
 
 /* Sequencer Status */
-#define SEQ_STATUS BIT(5)
+#define SEQ_STATUS		BIT(5)
 #define CHARGE_STEP		0x11
 
 #define ADC_CLK			3000000
@@ -150,7 +150,7 @@
  *
  * max processing time: 266431 * 308ns = 83ms(approx)
  */
-#define IDLE_TIMEOUT 83 /* milliseconds */
+#define IDLE_TIMEOUT		83 /* milliseconds */
 
 #define TSCADC_CELLS		2
 
-- 
cgit v1.2.3


From 48959fcdca8b335afa4485e71114008c81291bf9 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Fri, 15 Oct 2021 10:14:43 +0200
Subject: mfd: ti_am335x_tscadc: Use the new HZ_PER_MHZ macro

Before adding another frequency with even more zeroes, use the
HZ_PER_MHZ macro to clarify the number.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20211015081506.933180-26-miquel.raynal@bootlin.com
---
 include/linux/mfd/ti_am335x_tscadc.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/ti_am335x_tscadc.h b/include/linux/mfd/ti_am335x_tscadc.h
index 893c474c1f8c..a85643677bef 100644
--- a/include/linux/mfd/ti_am335x_tscadc.h
+++ b/include/linux/mfd/ti_am335x_tscadc.h
@@ -9,6 +9,7 @@
 #define __LINUX_TI_AM335X_TSCADC_MFD_H
 
 #include <linux/mfd/core.h>
+#include <linux/units.h>
 
 #define REG_RAWIRQSTATUS	0x024
 #define REG_IRQSTATUS		0x028
@@ -133,7 +134,7 @@
 #define SEQ_STATUS		BIT(5)
 #define CHARGE_STEP		0x11
 
-#define ADC_CLK			3000000
+#define ADC_CLK			(3 * HZ_PER_MHZ)
 #define TOTAL_STEPS		16
 #define TOTAL_CHANNELS		8
 #define FIFO1_THRESHOLD		19
-- 
cgit v1.2.3


From 65de5532a317b22f7fb93001cd0ed494145d3f4e Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Fri, 15 Oct 2021 10:14:44 +0200
Subject: mfd: ti_am335x_tscadc: Drop unused definitions from the header

The STEP ENABLE definitions are highly unclear and not used so drop them.

Suggested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20211015081506.933180-27-miquel.raynal@bootlin.com
---
 include/linux/mfd/ti_am335x_tscadc.h | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/ti_am335x_tscadc.h b/include/linux/mfd/ti_am335x_tscadc.h
index a85643677bef..1cd8cd34f2b7 100644
--- a/include/linux/mfd/ti_am335x_tscadc.h
+++ b/include/linux/mfd/ti_am335x_tscadc.h
@@ -39,13 +39,6 @@
 /* IRQ wakeup enable */
 #define IRQWKUP_ENB		BIT(0)
 
-/* Step Enable */
-#define STEPENB_MASK		(0x1FFFF << 0)
-#define STEPENB(val)		((val) << 0)
-#define ENB(val)		(1 << (val))
-#define STPENB_STEPENB		STEPENB(0x1FFFF)
-#define STPENB_STEPENB_TC	STEPENB(0x1FFF)
-
 /* IRQ enable */
 #define IRQENB_HW_PEN		BIT(0)
 #define IRQENB_EOS		BIT(1)
-- 
cgit v1.2.3


From b7cb7bf11817129c623fa921a1d6cd86b3cb727b Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Fri, 15 Oct 2021 10:14:45 +0200
Subject: mfd: ti_am335x_tscadc: Use BIT(), GENMASK() and FIELD_PREP() when
 relevant

Clean the ti_am335x_tscadc.h header by:
* converting masks to GENMASK()
* converting regular shifts to BIT()
* using FIELD_PREP() when relevant

Sometimes reorder the lines to be able to use the relevant bitmask.

Mind the s/%d/%ld/ change in a log due to the type change following the
use of FIELD_PREP() in the header.

Suggested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20211015081506.933180-28-miquel.raynal@bootlin.com
---
 include/linux/mfd/ti_am335x_tscadc.h | 61 ++++++++++++++++++------------------
 1 file changed, 31 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/ti_am335x_tscadc.h b/include/linux/mfd/ti_am335x_tscadc.h
index 1cd8cd34f2b7..ae694fa2d711 100644
--- a/include/linux/mfd/ti_am335x_tscadc.h
+++ b/include/linux/mfd/ti_am335x_tscadc.h
@@ -8,6 +8,7 @@
 #ifndef __LINUX_TI_AM335X_TSCADC_MFD_H
 #define __LINUX_TI_AM335X_TSCADC_MFD_H
 
+#include <linux/bitfield.h>
 #include <linux/mfd/core.h>
 #include <linux/units.h>
 
@@ -51,12 +52,12 @@
 #define IRQENB_PENUP		BIT(9)
 
 /* Step Configuration */
-#define STEPCONFIG_MODE_MASK	(3 << 0)
-#define STEPCONFIG_MODE(val)	((val) << 0)
+#define STEPCONFIG_MODE_MASK	GENMASK(1, 0)
+#define STEPCONFIG_MODE(val)	FIELD_PREP(STEPCONFIG_MODE_MASK, (val))
 #define STEPCONFIG_MODE_SWCNT	STEPCONFIG_MODE(1)
 #define STEPCONFIG_MODE_HWSYNC	STEPCONFIG_MODE(2)
-#define STEPCONFIG_AVG_MASK	(7 << 2)
-#define STEPCONFIG_AVG(val)	((val) << 2)
+#define STEPCONFIG_AVG_MASK	GENMASK(4, 2)
+#define STEPCONFIG_AVG(val)	FIELD_PREP(STEPCONFIG_AVG_MASK, (val))
 #define STEPCONFIG_AVG_16	STEPCONFIG_AVG(4)
 #define STEPCONFIG_XPP		BIT(5)
 #define STEPCONFIG_XNN		BIT(6)
@@ -64,43 +65,43 @@
 #define STEPCONFIG_YNN		BIT(8)
 #define STEPCONFIG_XNP		BIT(9)
 #define STEPCONFIG_YPN		BIT(10)
-#define STEPCONFIG_RFP(val)	((val) << 12)
-#define STEPCONFIG_RFP_VREFP	(0x3 << 12)
-#define STEPCONFIG_INM_MASK	(0xF << 15)
-#define STEPCONFIG_INM(val)	((val) << 15)
+#define STEPCONFIG_RFP_VREFP	GENMASK(13, 12)
+#define STEPCONFIG_RFP(val)	FIELD_PREP(STEPCONFIG_RFP_VREFP, (val))
+#define STEPCONFIG_INM_MASK	GENMASK(18, 15)
+#define STEPCONFIG_INM(val)	FIELD_PREP(STEPCONFIG_INM_MASK, (val))
 #define STEPCONFIG_INM_ADCREFM	STEPCONFIG_INM(8)
-#define STEPCONFIG_INP_MASK	(0xF << 19)
-#define STEPCONFIG_INP(val)	((val) << 19)
+#define STEPCONFIG_INP_MASK	GENMASK(22, 19)
+#define STEPCONFIG_INP(val)	FIELD_PREP(STEPCONFIG_INP_MASK, (val))
 #define STEPCONFIG_INP_AN4	STEPCONFIG_INP(4)
 #define STEPCONFIG_INP_ADCREFM	STEPCONFIG_INP(8)
 #define STEPCONFIG_FIFO1	BIT(26)
-#define STEPCONFIG_RFM(val)	((val) << 23)
-#define STEPCONFIG_RFM_VREFN	(0x3 << 23)
+#define STEPCONFIG_RFM_VREFN	GENMASK(24, 23)
+#define STEPCONFIG_RFM(val)	FIELD_PREP(STEPCONFIG_RFM_VREFN, (val))
 
 /* Delay register */
-#define STEPDELAY_OPEN_MASK	(0x3FFFF << 0)
-#define STEPDELAY_OPEN(val)	((val) << 0)
+#define STEPDELAY_OPEN_MASK	GENMASK(17, 0)
+#define STEPDELAY_OPEN(val)	FIELD_PREP(STEPDELAY_OPEN_MASK, (val))
 #define STEPCONFIG_OPENDLY	STEPDELAY_OPEN(0x098)
-#define STEPDELAY_SAMPLE_MASK	(0xFF << 24)
-#define STEPDELAY_SAMPLE(val)	((val) << 24)
+#define STEPDELAY_SAMPLE_MASK	GENMASK(31, 24)
+#define STEPDELAY_SAMPLE(val)	FIELD_PREP(STEPDELAY_SAMPLE_MASK, (val))
 #define STEPCONFIG_SAMPLEDLY	STEPDELAY_SAMPLE(0)
 
 /* Charge Config */
-#define STEPCHARGE_RFP_MASK	(7 << 12)
-#define STEPCHARGE_RFP(val)	((val) << 12)
+#define STEPCHARGE_RFP_MASK	GENMASK(14, 12)
+#define STEPCHARGE_RFP(val)	FIELD_PREP(STEPCHARGE_RFP_MASK, (val))
 #define STEPCHARGE_RFP_XPUL	STEPCHARGE_RFP(1)
-#define STEPCHARGE_INM_MASK	(0xF << 15)
-#define STEPCHARGE_INM(val)	((val) << 15)
+#define STEPCHARGE_INM_MASK	GENMASK(18, 15)
+#define STEPCHARGE_INM(val)	FIELD_PREP(STEPCHARGE_INM_MASK, (val))
 #define STEPCHARGE_INM_AN1	STEPCHARGE_INM(1)
-#define STEPCHARGE_INP_MASK	(0xF << 19)
-#define STEPCHARGE_INP(val)	((val) << 19)
-#define STEPCHARGE_RFM_MASK	(3 << 23)
-#define STEPCHARGE_RFM(val)	((val) << 23)
+#define STEPCHARGE_INP_MASK	GENMASK(22, 19)
+#define STEPCHARGE_INP(val)	FIELD_PREP(STEPCHARGE_INP_MASK, (val))
+#define STEPCHARGE_RFM_MASK	GENMASK(24, 23)
+#define STEPCHARGE_RFM(val)	FIELD_PREP(STEPCHARGE_RFM_MASK, (val))
 #define STEPCHARGE_RFM_XNUR	STEPCHARGE_RFM(1)
 
 /* Charge delay */
-#define CHARGEDLY_OPEN_MASK	(0x3FFFF << 0)
-#define CHARGEDLY_OPEN(val)	((val) << 0)
+#define CHARGEDLY_OPEN_MASK	GENMASK(17, 0)
+#define CHARGEDLY_OPEN(val)	FIELD_PREP(CHARGEDLY_OPEN_MASK, (val))
 #define CHARGEDLY_OPENDLY	CHARGEDLY_OPEN(0x400)
 
 /* Control register */
@@ -108,16 +109,16 @@
 #define CNTRLREG_STEPID		BIT(1)
 #define CNTRLREG_STEPCONFIGWRT	BIT(2)
 #define CNTRLREG_POWERDOWN	BIT(4)
-#define CNTRLREG_AFE_CTRL_MASK	(3 << 5)
-#define CNTRLREG_AFE_CTRL(val)	((val) << 5)
+#define CNTRLREG_AFE_CTRL_MASK	GENMASK(6, 5)
+#define CNTRLREG_AFE_CTRL(val)	FIELD_PREP(CNTRLREG_AFE_CTRL_MASK, (val))
 #define CNTRLREG_4WIRE		CNTRLREG_AFE_CTRL(1)
 #define CNTRLREG_5WIRE		CNTRLREG_AFE_CTRL(2)
 #define CNTRLREG_8WIRE		CNTRLREG_AFE_CTRL(3)
 #define CNTRLREG_TSCENB		BIT(7)
 
 /* FIFO READ Register */
-#define FIFOREAD_DATA_MASK	(0xfff << 0)
-#define FIFOREAD_CHNLID_MASK	(0xf << 16)
+#define FIFOREAD_DATA_MASK	GENMASK(11, 0)
+#define FIFOREAD_CHNLID_MASK	GENMASK(19, 16)
 
 /* DMA ENABLE/CLEAR Register */
 #define DMA_FIFO0		BIT(0)
-- 
cgit v1.2.3


From 01d838164b4c305c1cafb0c3f71fb0027d99358b Mon Sep 17 00:00:00 2001
From: Saurav Kashyap <skashyap@marvell.com>
Date: Mon, 23 Aug 2021 05:56:48 -0700
Subject: nvme-fc: add support for ->map_queues

NVMe FC don't have support for map queues, unlike the PCI, RDMA and TCP
transports.  Add a ->map_queues callout for the LLDDs to provide such
functionality.

Signed-off-by: Saurav Kashyap <skashyap@marvell.com>
Signed-off-by: Nilesh Javali <njavali@marvell.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/nvme-fc-driver.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h
index 2a38f2b477a5..cb909edb76c4 100644
--- a/include/linux/nvme-fc-driver.h
+++ b/include/linux/nvme-fc-driver.h
@@ -7,6 +7,7 @@
 #define _NVME_FC_DRIVER_H 1
 
 #include <linux/scatterlist.h>
+#include <linux/blk-mq.h>
 
 
 /*
@@ -497,6 +498,8 @@ struct nvme_fc_port_template {
 	int	(*xmt_ls_rsp)(struct nvme_fc_local_port *localport,
 				struct nvme_fc_remote_port *rport,
 				struct nvmefc_ls_rsp *ls_rsp);
+	void	(*map_queues)(struct nvme_fc_local_port *localport,
+			      struct blk_mq_queue_map *map);
 
 	u32	max_hw_queues;
 	u16	max_sgl_segments;
@@ -779,6 +782,10 @@ struct nvmet_fc_target_port {
  *       LS received.
  *       Entrypoint is Mandatory.
  *
+ * @map_queues: This functions lets the driver expose the queue mapping
+ *	 to the block layer.
+ *       Entrypoint is Optional.
+ *
  * @fcp_op:  Called to perform a data transfer or transmit a response.
  *       The nvmefc_tgt_fcp_req structure is the same LLDD-supplied
  *       exchange structure specified in the nvmet_fc_rcv_fcp_req() call
-- 
cgit v1.2.3


From 44c3c6257e99c6284f312206de73783575fc8906 Mon Sep 17 00:00:00 2001
From: Max Gurtovoy <mgurtovoy@nvidia.com>
Date: Thu, 23 Sep 2021 00:55:35 +0300
Subject: nvme-rdma: limit the maximal queue size for RDMA controllers

Corrent limit of 1024 isn't valid for some of the RDMA based ctrls. In
case the target expose a cap of larger amount of entries (e.g. 1024),
the initiator may fail to create a QP with this size. Thus limit to a
value that works for all RDMA adapters.

Future general solution should use RDMA/core API to calculate this size
according to device capabilities and number of WRs needed per NVMe IO
request.

Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/nvme-rdma.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h
index 3ec8e50efa16..4dd7e6fe92fb 100644
--- a/include/linux/nvme-rdma.h
+++ b/include/linux/nvme-rdma.h
@@ -6,6 +6,8 @@
 #ifndef _LINUX_NVME_RDMA_H
 #define _LINUX_NVME_RDMA_H
 
+#define NVME_RDMA_MAX_QUEUE_SIZE	128
+
 enum nvme_rdma_cm_fmt {
 	NVME_RDMA_CM_FMT_1_0 = 0x0,
 };
-- 
cgit v1.2.3


From e15a8a9755659ff5972f30de4dd64867c97f242d Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Wed, 22 Sep 2021 08:35:20 +0200
Subject: nvme: add CNTRLTYPE definitions for 'identify controller'

Update the 'identify controller' structure to define the newly added
CNTRLTYPE field.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/nvme.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index b7c4c4130b65..ed2428918bca 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -31,6 +31,12 @@ enum nvme_subsys_type {
 	NVME_NQN_NVME	= 2,		/* NVME type target subsystem */
 };
 
+enum nvme_ctrl_type {
+	NVME_CTRL_IO	= 1,		/* I/O controller */
+	NVME_CTRL_DISC	= 2,		/* Discovery controller */
+	NVME_CTRL_ADMIN	= 3,		/* Administrative controller */
+};
+
 /* Address Family codes for Discovery Log Page entry ADRFAM field */
 enum {
 	NVMF_ADDR_FAMILY_PCI	= 0,	/* PCIe */
@@ -244,7 +250,9 @@ struct nvme_id_ctrl {
 	__le32			rtd3e;
 	__le32			oaes;
 	__le32			ctratt;
-	__u8			rsvd100[28];
+	__u8			rsvd100[11];
+	__u8			cntrltype;
+	__u8			fguid[16];
 	__le16			crdt1;
 	__le16			crdt2;
 	__le16			crdt3;
-- 
cgit v1.2.3


From d56ae18f063e38eba47550c632519c9fe3f76b19 Mon Sep 17 00:00:00 2001
From: Max Gurtovoy <mgurtovoy@nvidia.com>
Date: Thu, 23 Sep 2021 13:17:44 +0300
Subject: nvmet: use macro definitions for setting cmic value

This makes the code more readable.

Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/nvme.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index ed2428918bca..357482dedb59 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -320,6 +320,7 @@ struct nvme_id_ctrl {
 };
 
 enum {
+	NVME_CTRL_CMIC_MULTI_PORT		= 1 << 0,
 	NVME_CTRL_CMIC_MULTI_CTRL		= 1 << 1,
 	NVME_CTRL_CMIC_ANA			= 1 << 3,
 	NVME_CTRL_ONCS_COMPARE			= 1 << 0,
-- 
cgit v1.2.3


From cdd591fc86e38ad3899196066219fbbd845f3162 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Mon, 5 Jul 2021 17:26:28 +0200
Subject: iov_iter: Introduce fault_in_iov_iter_writeable

Introduce a new fault_in_iov_iter_writeable helper for safely faulting
in an iterator for writing.  Uses get_user_pages() to fault in the pages
without actually writing to them, which would be destructive.

We'll use fault_in_iov_iter_writeable in gfs2 once we've determined that
the iterator passed to .read_iter isn't in memory.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 include/linux/pagemap.h | 1 +
 include/linux/uio.h     | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 9fe94f7a4f7e..2f7dd14083d9 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -736,6 +736,7 @@ extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter);
  * Fault in userspace address range.
  */
 size_t fault_in_writeable(char __user *uaddr, size_t size);
+size_t fault_in_safe_writeable(const char __user *uaddr, size_t size);
 size_t fault_in_readable(const char __user *uaddr, size_t size);
 
 int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
diff --git a/include/linux/uio.h b/include/linux/uio.h
index d18458af6681..25d1c24fd829 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -134,6 +134,7 @@ size_t copy_page_from_iter_atomic(struct page *page, unsigned offset,
 void iov_iter_advance(struct iov_iter *i, size_t bytes);
 void iov_iter_revert(struct iov_iter *i, size_t bytes);
 size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes);
+size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t bytes);
 size_t iov_iter_single_seg_count(const struct iov_iter *i);
 size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
 			 struct iov_iter *i);
-- 
cgit v1.2.3


From a164ff53cbd34479aeac3366840669b10845ce53 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 14 Oct 2021 16:47:54 +0300
Subject: driver core: Provide device_match_acpi_handle() helper

We have a couple of users of this helper, make it available for them.

The prototype for the helper is specifically crafted in order to be
easily used with bus_find_device() call. That's why its location is
in the driver core rather than ACPI.

Reviewed-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20211014134756.39092-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/device/bus.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h
index 062777a45a74..a039ab809753 100644
--- a/include/linux/device/bus.h
+++ b/include/linux/device/bus.h
@@ -143,6 +143,7 @@ int device_match_of_node(struct device *dev, const void *np);
 int device_match_fwnode(struct device *dev, const void *fwnode);
 int device_match_devt(struct device *dev, const void *pdevt);
 int device_match_acpi_dev(struct device *dev, const void *adev);
+int device_match_acpi_handle(struct device *dev, const void *handle);
 int device_match_any(struct device *dev, const void *unused);
 
 /* iterator helpers for buses */
-- 
cgit v1.2.3


From f1985002839af80d6c84e9537834a81fb1364d6e Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Wed, 20 Oct 2021 09:33:21 +0100
Subject: irqchip: Provide stronger type checking for
 IRQCHIP_MATCH/IRQCHIP_DECLARE

Both IRQCHIP_DECLARE() and IRQCHIP_MATCH() use an underlying of_device_id()
structure to encode the matching property and the init callback.
However, this callback is stored in as a void * pointer, which obviously
defeat any attempt at stronger type checking.

Work around this by providing a new macro that builds on top of the
__typecheck() primitive, and that can be used to warn when there is
a discrepency between the drivers and core code.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20211020104527.3066268-1-maz@kernel.org
---
 include/linux/irqchip.h | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqchip.h b/include/linux/irqchip.h
index 67351aac65ef..5de0dfc5d64d 100644
--- a/include/linux/irqchip.h
+++ b/include/linux/irqchip.h
@@ -14,8 +14,15 @@
 #include <linux/acpi.h>
 #include <linux/module.h>
 #include <linux/of.h>
+#include <linux/of_irq.h>
 #include <linux/platform_device.h>
 
+/* Undefined on purpose */
+extern of_irq_init_cb_t typecheck_irq_init_cb;
+
+#define typecheck_irq_init_cb(fn)					\
+	(__typecheck(typecheck_irq_init_cb, &fn) ? fn : fn)
+
 /*
  * This macro must be used by the different irqchip drivers to declare
  * the association between their DT compatible string and their
@@ -26,14 +33,16 @@
  * @compstr: compatible string of the irqchip driver
  * @fn: initialization function
  */
-#define IRQCHIP_DECLARE(name, compat, fn) OF_DECLARE_2(irqchip, name, compat, fn)
+#define IRQCHIP_DECLARE(name, compat, fn)	\
+	OF_DECLARE_2(irqchip, name, compat, typecheck_irq_init_cb(fn))
 
 extern int platform_irqchip_probe(struct platform_device *pdev);
 
 #define IRQCHIP_PLATFORM_DRIVER_BEGIN(drv_name) \
 static const struct of_device_id drv_name##_irqchip_match_table[] = {
 
-#define IRQCHIP_MATCH(compat, fn) { .compatible = compat, .data = fn },
+#define IRQCHIP_MATCH(compat, fn) { .compatible = compat,		\
+				    .data = typecheck_irq_init_cb(fn), },
 
 #define IRQCHIP_PLATFORM_DRIVER_END(drv_name)				\
 	{},								\
-- 
cgit v1.2.3


From 795e92ec5fd79027648bd7f779d34bad5b6f2f55 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Wed, 6 Oct 2021 11:43:21 -0500
Subject: of: Add of_get_cpu_hwid() to read hardware ID from CPU nodes

There are various open coded implementions parsing the CPU node 'reg'
property which contains the CPU's hardware ID. Introduce a new function,
of_get_cpu_hwid(), to read the hardware ID.

All the callers should be DT only code, so no need for an empty
function.

Cc: Frank Rowand <frowand.list@gmail.com>
Signed-off-by: Rob Herring <robh@kernel.org>
Tested-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Sudeep Holla <sudeep.holla@arm.com>
Link: https://lore.kernel.org/r/20211006164332.1981454-2-robh@kernel.org
---
 include/linux/of.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/of.h b/include/linux/of.h
index 6f1c41f109bb..807f8168dad9 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -353,6 +353,7 @@ extern struct device_node *of_get_cpu_node(int cpu, unsigned int *thread);
 extern struct device_node *of_get_next_cpu_node(struct device_node *prev);
 extern struct device_node *of_get_cpu_state_node(struct device_node *cpu_node,
 						 int index);
+extern u64 of_get_cpu_hwid(struct device_node *cpun, unsigned int thread);
 
 #define for_each_property_of_node(dn, pp) \
 	for (pp = dn->properties; pp != NULL; pp = pp->next)
-- 
cgit v1.2.3


From a3c85b2ee098c4a293148fab4eb96299e92b9524 Mon Sep 17 00:00:00 2001
From: Nathan Lynch <nathanl@linux.ibm.com>
Date: Thu, 14 Oct 2021 12:30:55 -0500
Subject: of: make of_node_check_flag() device_node parameter const

The device_node argument isn't modified by of_node_check_flag(), so mark it
const.

Signed-off-by: Nathan Lynch <nathanl@linux.ibm.com>
Link: https://lore.kernel.org/r/20211014173055.2117872-1-nathanl@linux.ibm.com
Signed-off-by: Rob Herring <robh@kernel.org>
---
 include/linux/of.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/of.h b/include/linux/of.h
index 807f8168dad9..ff143a027abc 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -185,7 +185,7 @@ static inline bool of_node_is_root(const struct device_node *node)
 	return node && (node->parent == NULL);
 }
 
-static inline int of_node_check_flag(struct device_node *n, unsigned long flag)
+static inline int of_node_check_flag(const struct device_node *n, unsigned long flag)
 {
 	return test_bit(flag, &n->_flags);
 }
-- 
cgit v1.2.3


From b8419e7be6c6029eee3448fda45f4f9ad340c4ca Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Wed, 20 Oct 2021 11:48:59 -0700
Subject: irqchip: Fix kernel-doc parameter typo for IRQCHIP_DECLARE

The documentation refers to "compstr" when we have the parameter named
"compat", fix the typo.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20211020184859.2705451-14-f.fainelli@gmail.com
---
 include/linux/irqchip.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/irqchip.h b/include/linux/irqchip.h
index 5de0dfc5d64d..7f007b9c23f8 100644
--- a/include/linux/irqchip.h
+++ b/include/linux/irqchip.h
@@ -30,7 +30,7 @@ extern of_irq_init_cb_t typecheck_irq_init_cb;
  *
  * @name: name that must be unique across all IRQCHIP_DECLARE of the
  * same file.
- * @compstr: compatible string of the irqchip driver
+ * @compat: compatible string of the irqchip driver
  * @fn: initialization function
  */
 #define IRQCHIP_DECLARE(name, compat, fn)	\
-- 
cgit v1.2.3


From 133a48abf6ecc535d7eddc6da1c3e4c972445882 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Mon, 4 Oct 2021 15:37:42 -0400
Subject: NFS: Fix up commit deadlocks

If O_DIRECT bumps the commit_info rpcs_out field, then that could lead
to fsync() hangs. The fix is to ensure that O_DIRECT calls
nfs_commit_end().

Fixes: 723c921e7dfc ("sched/wait, fs/nfs: Convert wait_on_atomic_t() usage to the new wait_var_event() API")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/nfs_fs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index ca547cc5458c..e58b78da7a98 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -568,6 +568,7 @@ extern int nfs_wb_page_cancel(struct inode *inode, struct page* page);
 extern int  nfs_commit_inode(struct inode *, int);
 extern struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail);
 extern void nfs_commit_free(struct nfs_commit_data *data);
+bool nfs_commit_end(struct nfs_mds_commit_info *cinfo);
 
 static inline int
 nfs_have_writebacks(struct inode *inode)
-- 
cgit v1.2.3


From 0ebeebcf59601bcfa0284f4bb7abdec051eb856d Mon Sep 17 00:00:00 2001
From: Dave Wysochanski <dwysocha@redhat.com>
Date: Sun, 10 Oct 2021 18:23:13 -0400
Subject: NFS: Fix WARN_ON due to unionization of nfs_inode.nrequests

Fixes the following WARN_ON
WARNING: CPU: 2 PID: 18678 at fs/nfs/inode.c:123 nfs_clear_inode+0x3b/0x50 [nfs]
...
Call Trace:
  nfs4_evict_inode+0x57/0x70 [nfsv4]
  evict+0xd1/0x180
  dispose_list+0x48/0x60
  evict_inodes+0x156/0x190
  generic_shutdown_super+0x37/0x110
  nfs_kill_super+0x1d/0x40 [nfs]
  deactivate_locked_super+0x36/0xa0

Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/nfs_fs.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index e58b78da7a98..457b866a2d9e 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -573,7 +573,9 @@ bool nfs_commit_end(struct nfs_mds_commit_info *cinfo);
 static inline int
 nfs_have_writebacks(struct inode *inode)
 {
-	return atomic_long_read(&NFS_I(inode)->nrequests) != 0;
+	if (S_ISREG(inode->i_mode))
+		return atomic_long_read(&NFS_I(inode)->nrequests) != 0;
+	return 0;
 }
 
 /*
-- 
cgit v1.2.3


From e591b298d7ecb851e200f65946e3d53fe78a3c4f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 28 Sep 2021 17:41:41 -0400
Subject: NFS: Save some space in the inode

Save some space in the nfs_inode by setting up an anonymous union with
the fields that are peculiar to a specific type of filesystem object.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/nfs_fs.h | 42 ++++++++++++++++++++++++------------------
 1 file changed, 24 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 457b866a2d9e..739ca1ef934f 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -155,33 +155,39 @@ struct nfs_inode {
 	unsigned long		attrtimeo_timestamp;
 
 	unsigned long		attr_gencount;
-	/* "Generation counter" for the attribute cache. This is
-	 * bumped whenever we update the metadata on the
-	 * server.
-	 */
-	unsigned long		cache_change_attribute;
 
 	struct rb_root		access_cache;
 	struct list_head	access_cache_entry_lru;
 	struct list_head	access_cache_inode_lru;
 
-	/*
-	 * This is the cookie verifier used for NFSv3 readdir
-	 * operations
-	 */
-	__be32			cookieverf[NFS_DIR_VERIFIER_SIZE];
-
-	atomic_long_t		nrequests;
-	struct nfs_mds_commit_info commit_info;
+	union {
+		/* Directory */
+		struct {
+			/* "Generation counter" for the attribute cache.
+			 * This is bumped whenever we update the metadata
+			 * on the server.
+			 */
+			unsigned long	cache_change_attribute;
+			/*
+			 * This is the cookie verifier used for NFSv3 readdir
+			 * operations
+			 */
+			__be32		cookieverf[NFS_DIR_VERIFIER_SIZE];
+			/* Readers: in-flight sillydelete RPC calls */
+			/* Writers: rmdir */
+			struct rw_semaphore	rmdir_sem;
+		};
+		/* Regular file */
+		struct {
+			atomic_long_t	nrequests;
+			struct nfs_mds_commit_info commit_info;
+			struct mutex	commit_mutex;
+		};
+	};
 
 	/* Open contexts for shared mmap writes */
 	struct list_head	open_files;
 
-	/* Readers: in-flight sillydelete RPC calls */
-	/* Writers: rmdir */
-	struct rw_semaphore	rmdir_sem;
-	struct mutex		commit_mutex;
-
 #if IS_ENABLED(CONFIG_NFS_V4)
 	struct nfs4_cached_acl	*nfs4_acl;
         /* NFSv4 state */
-- 
cgit v1.2.3


From 0c0593b45c9b4e5b212ffb3fb28bb8d3c0ec0dc8 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Fri, 8 Oct 2021 11:13:31 +0200
Subject: x86/ftrace: Make function graph use ftrace directly

We don't need special hook for graph tracer entry point,
but instead we can use graph_ops::func function to install
the return_hooker.

This moves the graph tracing setup _before_ the direct
trampoline prepares the stack, so the return_hooker will
be called when the direct trampoline is finished.

This simplifies the code, because we don't need to take into
account the direct trampoline setup when preparing the graph
tracer hooker and we can allow function graph tracer on entries
registered with direct trampoline.

Link: https://lkml.kernel.org/r/20211008091336.33616-4-jolsa@kernel.org

[fixed compile error reported by kernel test robot <lkp@intel.com>]
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/ftrace.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 12fcfa2d23ea..16a7baaba702 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -803,6 +803,15 @@ static inline bool is_ftrace_trampoline(unsigned long addr)
 }
 #endif /* CONFIG_DYNAMIC_FTRACE */
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+#ifndef ftrace_graph_func
+#define ftrace_graph_func ftrace_stub
+#define FTRACE_OPS_GRAPH_STUB FTRACE_OPS_FL_STUB
+#else
+#define FTRACE_OPS_GRAPH_STUB 0
+#endif
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
 /* totally disable ftrace - can not re-enable after this */
 void ftrace_kill(void);
 
-- 
cgit v1.2.3


From e967b60eb51116d2347093655e555aa036dd40d8 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Fri, 15 Oct 2021 10:14:46 +0200
Subject: mfd: ti_am335x_tscadc: Clarify the maximum values for DT entries

Clearly define the maximum open delay and sample delay. Use these
definitions in place of a mask (which works because this is the first
field in the register) and an open-coded value. While at it reword a
little bit the error messages to make them look clearer and similar.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20211015081506.933180-29-miquel.raynal@bootlin.com
---
 include/linux/mfd/ti_am335x_tscadc.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/ti_am335x_tscadc.h b/include/linux/mfd/ti_am335x_tscadc.h
index ae694fa2d711..31cffb6e8b17 100644
--- a/include/linux/mfd/ti_am335x_tscadc.h
+++ b/include/linux/mfd/ti_am335x_tscadc.h
@@ -84,7 +84,9 @@
 #define STEPCONFIG_OPENDLY	STEPDELAY_OPEN(0x098)
 #define STEPDELAY_SAMPLE_MASK	GENMASK(31, 24)
 #define STEPDELAY_SAMPLE(val)	FIELD_PREP(STEPDELAY_SAMPLE_MASK, (val))
+#define STEPCONFIG_MAX_OPENDLY	GENMASK(17, 0)
 #define STEPCONFIG_SAMPLEDLY	STEPDELAY_SAMPLE(0)
+#define STEPCONFIG_MAX_SAMPLE	GENMASK(7, 0)
 
 /* Charge Config */
 #define STEPCHARGE_RFP_MASK	GENMASK(14, 12)
-- 
cgit v1.2.3


From 0fd12262613116a38fd76aefa396e9bc116fbfe2 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Fri, 15 Oct 2021 10:14:47 +0200
Subject: mfd: ti_am335x_tscadc: Drop useless definitions from the header

Drop useless definitions from the header like the "masks" definitions
which are only used by the following definition.

It could be possible to got even further by removing these definitions
entirely and use FIELD_PREP() macros from the code directly, but while I
have no troubles making these changes in the header, changing the values
in the code directly could darkening a bit the logic and
hardening future git-blames for very little added value IMHO (but this
is of course a personal taste).

Certain macros are using GENMASK() to define the value of a particular
field, while this is purely "by chance" that the value and the mask have
the same value. In this case, drop the "mask" definition, use
FIELD_PREP() and GENMASK() in the macro defining the field, and use the
new macro to define the particular value by feeding directly the actual
number advertised in the datasheet into that macro, as in:
	-#define STEPCONFIG_RFM_VREFN   GENMASK(24, 23)
	-#define STEPCONFIG_RFM(val)    FIELD_PREP(STEPCONFIG_RFM_VREFN, (val))
	+#define STEPCONFIG_RFM(val)    FIELD_PREP(GENMASK(24, 23), (val))
	+#define STEPCONFIG_RFM_VREFN   STEPCONFIG_RFM(3)

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20211015081506.933180-30-miquel.raynal@bootlin.com
---
 include/linux/mfd/ti_am335x_tscadc.h | 44 +++++++++++++-----------------------
 1 file changed, 16 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/ti_am335x_tscadc.h b/include/linux/mfd/ti_am335x_tscadc.h
index 31cffb6e8b17..51d987080cd3 100644
--- a/include/linux/mfd/ti_am335x_tscadc.h
+++ b/include/linux/mfd/ti_am335x_tscadc.h
@@ -52,12 +52,10 @@
 #define IRQENB_PENUP		BIT(9)
 
 /* Step Configuration */
-#define STEPCONFIG_MODE_MASK	GENMASK(1, 0)
-#define STEPCONFIG_MODE(val)	FIELD_PREP(STEPCONFIG_MODE_MASK, (val))
+#define STEPCONFIG_MODE(val)	FIELD_PREP(GENMASK(1, 0), (val))
 #define STEPCONFIG_MODE_SWCNT	STEPCONFIG_MODE(1)
 #define STEPCONFIG_MODE_HWSYNC	STEPCONFIG_MODE(2)
-#define STEPCONFIG_AVG_MASK	GENMASK(4, 2)
-#define STEPCONFIG_AVG(val)	FIELD_PREP(STEPCONFIG_AVG_MASK, (val))
+#define STEPCONFIG_AVG(val)	FIELD_PREP(GENMASK(4, 2), (val))
 #define STEPCONFIG_AVG_16	STEPCONFIG_AVG(4)
 #define STEPCONFIG_XPP		BIT(5)
 #define STEPCONFIG_XNN		BIT(6)
@@ -65,45 +63,36 @@
 #define STEPCONFIG_YNN		BIT(8)
 #define STEPCONFIG_XNP		BIT(9)
 #define STEPCONFIG_YPN		BIT(10)
-#define STEPCONFIG_RFP_VREFP	GENMASK(13, 12)
-#define STEPCONFIG_RFP(val)	FIELD_PREP(STEPCONFIG_RFP_VREFP, (val))
-#define STEPCONFIG_INM_MASK	GENMASK(18, 15)
-#define STEPCONFIG_INM(val)	FIELD_PREP(STEPCONFIG_INM_MASK, (val))
+#define STEPCONFIG_RFP(val)	FIELD_PREP(GENMASK(13, 12), (val))
+#define STEPCONFIG_RFP_VREFP	STEPCONFIG_RFP(3)
+#define STEPCONFIG_INM(val)	FIELD_PREP(GENMASK(18, 15), (val))
 #define STEPCONFIG_INM_ADCREFM	STEPCONFIG_INM(8)
-#define STEPCONFIG_INP_MASK	GENMASK(22, 19)
-#define STEPCONFIG_INP(val)	FIELD_PREP(STEPCONFIG_INP_MASK, (val))
+#define STEPCONFIG_INP(val)	FIELD_PREP(GENMASK(22, 19), (val))
 #define STEPCONFIG_INP_AN4	STEPCONFIG_INP(4)
 #define STEPCONFIG_INP_ADCREFM	STEPCONFIG_INP(8)
 #define STEPCONFIG_FIFO1	BIT(26)
-#define STEPCONFIG_RFM_VREFN	GENMASK(24, 23)
-#define STEPCONFIG_RFM(val)	FIELD_PREP(STEPCONFIG_RFM_VREFN, (val))
+#define STEPCONFIG_RFM(val)	FIELD_PREP(GENMASK(24, 23), (val))
+#define STEPCONFIG_RFM_VREFN	STEPCONFIG_RFM(3)
 
 /* Delay register */
-#define STEPDELAY_OPEN_MASK	GENMASK(17, 0)
-#define STEPDELAY_OPEN(val)	FIELD_PREP(STEPDELAY_OPEN_MASK, (val))
+#define STEPDELAY_OPEN(val)	FIELD_PREP(GENMASK(17, 0), (val))
 #define STEPCONFIG_OPENDLY	STEPDELAY_OPEN(0x098)
-#define STEPDELAY_SAMPLE_MASK	GENMASK(31, 24)
-#define STEPDELAY_SAMPLE(val)	FIELD_PREP(STEPDELAY_SAMPLE_MASK, (val))
 #define STEPCONFIG_MAX_OPENDLY	GENMASK(17, 0)
+#define STEPDELAY_SAMPLE(val)	FIELD_PREP(GENMASK(31, 24), (val))
 #define STEPCONFIG_SAMPLEDLY	STEPDELAY_SAMPLE(0)
 #define STEPCONFIG_MAX_SAMPLE	GENMASK(7, 0)
 
 /* Charge Config */
-#define STEPCHARGE_RFP_MASK	GENMASK(14, 12)
-#define STEPCHARGE_RFP(val)	FIELD_PREP(STEPCHARGE_RFP_MASK, (val))
+#define STEPCHARGE_RFP(val)	FIELD_PREP(GENMASK(14, 12), (val))
 #define STEPCHARGE_RFP_XPUL	STEPCHARGE_RFP(1)
-#define STEPCHARGE_INM_MASK	GENMASK(18, 15)
-#define STEPCHARGE_INM(val)	FIELD_PREP(STEPCHARGE_INM_MASK, (val))
+#define STEPCHARGE_INM(val)	FIELD_PREP(GENMASK(18, 15), (val))
 #define STEPCHARGE_INM_AN1	STEPCHARGE_INM(1)
-#define STEPCHARGE_INP_MASK	GENMASK(22, 19)
-#define STEPCHARGE_INP(val)	FIELD_PREP(STEPCHARGE_INP_MASK, (val))
-#define STEPCHARGE_RFM_MASK	GENMASK(24, 23)
-#define STEPCHARGE_RFM(val)	FIELD_PREP(STEPCHARGE_RFM_MASK, (val))
+#define STEPCHARGE_INP(val)	FIELD_PREP(GENMASK(22, 19), (val))
+#define STEPCHARGE_RFM(val)	FIELD_PREP(GENMASK(24, 23), (val))
 #define STEPCHARGE_RFM_XNUR	STEPCHARGE_RFM(1)
 
 /* Charge delay */
-#define CHARGEDLY_OPEN_MASK	GENMASK(17, 0)
-#define CHARGEDLY_OPEN(val)	FIELD_PREP(CHARGEDLY_OPEN_MASK, (val))
+#define CHARGEDLY_OPEN(val)	FIELD_PREP(GENMASK(17, 0), (val))
 #define CHARGEDLY_OPENDLY	CHARGEDLY_OPEN(0x400)
 
 /* Control register */
@@ -111,8 +100,7 @@
 #define CNTRLREG_STEPID		BIT(1)
 #define CNTRLREG_STEPCONFIGWRT	BIT(2)
 #define CNTRLREG_POWERDOWN	BIT(4)
-#define CNTRLREG_AFE_CTRL_MASK	GENMASK(6, 5)
-#define CNTRLREG_AFE_CTRL(val)	FIELD_PREP(CNTRLREG_AFE_CTRL_MASK, (val))
+#define CNTRLREG_AFE_CTRL(val)	FIELD_PREP(GENMASK(6, 5), (val))
 #define CNTRLREG_4WIRE		CNTRLREG_AFE_CTRL(1)
 #define CNTRLREG_5WIRE		CNTRLREG_AFE_CTRL(2)
 #define CNTRLREG_8WIRE		CNTRLREG_AFE_CTRL(3)
-- 
cgit v1.2.3


From c3e36b5d069241f3cbc0e647cf297c5a329cd7a6 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Fri, 15 Oct 2021 10:14:48 +0200
Subject: mfd: ti_am335x_tscadc: Rename the subsystem enable macro

This bit is common to all devices (ADC, Touchscreen, Magnetic reader) so
make it clear that it can be used from any location by operating a
mechanical rename:
s/CNTRLREG_TSCSSENB/CNTRLREG_SSENB/

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Acked-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20211015081506.933180-31-miquel.raynal@bootlin.com
---
 include/linux/mfd/ti_am335x_tscadc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/ti_am335x_tscadc.h b/include/linux/mfd/ti_am335x_tscadc.h
index 51d987080cd3..860289ae8516 100644
--- a/include/linux/mfd/ti_am335x_tscadc.h
+++ b/include/linux/mfd/ti_am335x_tscadc.h
@@ -96,7 +96,7 @@
 #define CHARGEDLY_OPENDLY	CHARGEDLY_OPEN(0x400)
 
 /* Control register */
-#define CNTRLREG_TSCSSENB	BIT(0)
+#define CNTRLREG_SSENB		BIT(0)
 #define CNTRLREG_STEPID		BIT(1)
 #define CNTRLREG_STEPCONFIGWRT	BIT(2)
 #define CNTRLREG_POWERDOWN	BIT(4)
-- 
cgit v1.2.3


From 2f89c2619ce93bf2b0e6e721614fc33e8cf48f03 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Fri, 15 Oct 2021 10:14:49 +0200
Subject: mfd: ti_am335x_tscadc: Add TSC prefix in certain macros

While the register list (and names) between ADC0 and ADC1 are pretty
close, the bits inside changed a little bit. To avoid any future
confusion, let's add the TSC prefix when some bits are in a register
that is common to both revisions of the ADC, but are specific to the
am33xx hardware.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20211015081506.933180-32-miquel.raynal@bootlin.com
---
 include/linux/mfd/ti_am335x_tscadc.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/ti_am335x_tscadc.h b/include/linux/mfd/ti_am335x_tscadc.h
index 860289ae8516..cc6de9258455 100644
--- a/include/linux/mfd/ti_am335x_tscadc.h
+++ b/include/linux/mfd/ti_am335x_tscadc.h
@@ -98,13 +98,13 @@
 /* Control register */
 #define CNTRLREG_SSENB		BIT(0)
 #define CNTRLREG_STEPID		BIT(1)
-#define CNTRLREG_STEPCONFIGWRT	BIT(2)
+#define CNTRLREG_TSC_STEPCONFIGWRT BIT(2)
 #define CNTRLREG_POWERDOWN	BIT(4)
-#define CNTRLREG_AFE_CTRL(val)	FIELD_PREP(GENMASK(6, 5), (val))
-#define CNTRLREG_4WIRE		CNTRLREG_AFE_CTRL(1)
-#define CNTRLREG_5WIRE		CNTRLREG_AFE_CTRL(2)
-#define CNTRLREG_8WIRE		CNTRLREG_AFE_CTRL(3)
-#define CNTRLREG_TSCENB		BIT(7)
+#define CNTRLREG_TSC_AFE_CTRL(val) FIELD_PREP(GENMASK(6, 5), (val))
+#define CNTRLREG_TSC_4WIRE	CNTRLREG_TSC_AFE_CTRL(1)
+#define CNTRLREG_TSC_5WIRE	CNTRLREG_TSC_AFE_CTRL(2)
+#define CNTRLREG_TSC_8WIRE	CNTRLREG_TSC_AFE_CTRL(3)
+#define CNTRLREG_TSC_ENB	BIT(7)
 
 /* FIFO READ Register */
 #define FIFOREAD_DATA_MASK	GENMASK(11, 0)
@@ -118,7 +118,7 @@
 #define SEQ_STATUS		BIT(5)
 #define CHARGE_STEP		0x11
 
-#define ADC_CLK			(3 * HZ_PER_MHZ)
+#define TSC_ADC_CLK		(3 * HZ_PER_MHZ)
 #define TOTAL_STEPS		16
 #define TOTAL_CHANNELS		8
 #define FIFO1_THRESHOLD		19
-- 
cgit v1.2.3


From bf0f394c7b1e4d623ca2afdf59b3b4b95cc6f592 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Fri, 15 Oct 2021 10:14:53 +0200
Subject: mfd: ti_am335x_tscadc: Introduce a helper to deal with the type of
 hardware

One way of knowing which hardware we are dealing with is to check the
compatible string. When this must be done at several places, it's best
and certainly more clear to use a helper for that.

Introduce ti_adc_with_touchscreen() to indicate if there is a touchscreen
controller available (meaning it's an am33xx-like ADC). This helper does
not indicate if it is actually used (that is the purpose of the use_tsc
boolean).

Introducing this helper helps making a difference in the code between
what is generic to both types of ADCs and what is specific to the am33xx
hardware before introducing support for the am437x hardware.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20211015081506.933180-36-miquel.raynal@bootlin.com
---
 include/linux/mfd/ti_am335x_tscadc.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/ti_am335x_tscadc.h b/include/linux/mfd/ti_am335x_tscadc.h
index cc6de9258455..ee160b2036c1 100644
--- a/include/linux/mfd/ti_am335x_tscadc.h
+++ b/include/linux/mfd/ti_am335x_tscadc.h
@@ -177,6 +177,12 @@ static inline struct ti_tscadc_dev *ti_tscadc_dev_get(struct platform_device *p)
 	return *tscadc_dev;
 }
 
+static inline bool ti_adc_with_touchscreen(struct ti_tscadc_dev *tscadc)
+{
+	return of_device_is_compatible(tscadc->dev->of_node,
+				       "ti,am3359-tscadc");
+}
+
 void am335x_tsc_se_set_cache(struct ti_tscadc_dev *tsadc, u32 val);
 void am335x_tsc_se_set_once(struct ti_tscadc_dev *tsadc, u32 val);
 void am335x_tsc_se_clr(struct ti_tscadc_dev *tsadc, u32 val);
-- 
cgit v1.2.3


From 0a1233031c16d8575be6b864e6fd353b6fd758c4 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Fri, 15 Oct 2021 10:14:54 +0200
Subject: mfd: ti_am335x_tscadc: Add ADC1/magnetic reader support

Introduce a new compatible that has another set of driver data,
targeting am437x SoCs with a magnetic reader instead of the
touchscreen and a more featureful set of registers.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20211015081506.933180-37-miquel.raynal@bootlin.com
---
 include/linux/mfd/ti_am335x_tscadc.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/ti_am335x_tscadc.h b/include/linux/mfd/ti_am335x_tscadc.h
index ee160b2036c1..5225e3fc194d 100644
--- a/include/linux/mfd/ti_am335x_tscadc.h
+++ b/include/linux/mfd/ti_am335x_tscadc.h
@@ -106,6 +106,11 @@
 #define CNTRLREG_TSC_8WIRE	CNTRLREG_TSC_AFE_CTRL(3)
 #define CNTRLREG_TSC_ENB	BIT(7)
 
+/*Control registers bitfields  for MAGADC IP */
+#define CNTRLREG_MAGADCENB      BIT(0)
+#define CNTRLREG_MAG_PREAMP_PWRDOWN BIT(5)
+#define CNTRLREG_MAG_PREAMP_BYPASS  BIT(6)
+
 /* FIFO READ Register */
 #define FIFOREAD_DATA_MASK	GENMASK(11, 0)
 #define FIFOREAD_CHNLID_MASK	GENMASK(19, 16)
@@ -119,6 +124,7 @@
 #define CHARGE_STEP		0x11
 
 #define TSC_ADC_CLK		(3 * HZ_PER_MHZ)
+#define MAG_ADC_CLK		(13 * HZ_PER_MHZ)
 #define TOTAL_STEPS		16
 #define TOTAL_CHANNELS		8
 #define FIFO1_THRESHOLD		19
-- 
cgit v1.2.3


From 789e5ebcc61b72a71717b02b62e6c6f10fdbb722 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Fri, 15 Oct 2021 10:15:01 +0200
Subject: iio: adc: ti_am335x_adc: Add a unit to the timeout delay

The lack of unit in the macro name kind of tricked me when I was
troubleshooting an issue. Physical constants should always get a unit.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Acked-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20211015081506.933180-44-miquel.raynal@bootlin.com
---
 include/linux/mfd/ti_am335x_tscadc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/ti_am335x_tscadc.h b/include/linux/mfd/ti_am335x_tscadc.h
index 5225e3fc194d..ba13e043d910 100644
--- a/include/linux/mfd/ti_am335x_tscadc.h
+++ b/include/linux/mfd/ti_am335x_tscadc.h
@@ -141,7 +141,7 @@
  *
  * max processing time: 266431 * 308ns = 83ms(approx)
  */
-#define IDLE_TIMEOUT		83 /* milliseconds */
+#define IDLE_TIMEOUT_MS		83 /* milliseconds */
 
 #define TSCADC_CELLS		2
 
-- 
cgit v1.2.3


From 9a48e7564ac83fb0f1d5b0eac5fe8a7af62da398 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 20 Oct 2021 13:00:39 -0700
Subject: compiler-gcc.h: Define __SANITIZE_ADDRESS__ under hwaddress sanitizer

When Clang is using the hwaddress sanitizer, it sets __SANITIZE_ADDRESS__
explicitly:

 #if __has_feature(address_sanitizer) || __has_feature(hwaddress_sanitizer)
 /* Emulate GCC's __SANITIZE_ADDRESS__ flag */
 #define __SANITIZE_ADDRESS__
 #endif

Once hwaddress sanitizer was added to GCC, however, a separate define
was created, __SANITIZE_HWADDRESS__. The kernel is expecting to find
__SANITIZE_ADDRESS__ in either case, though, and the existing string
macros break on supported architectures:

 #if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \
          !defined(__SANITIZE_ADDRESS__)

where as other architectures (like arm32) have no idea about hwaddress
sanitizer and just check for __SANITIZE_ADDRESS__:

 #if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)

This would lead to compiler foritfy self-test warnings when building
with CONFIG_KASAN_SW_TAGS=y:

warning: unsafe memmove() usage lacked '__read_overflow2' symbol in lib/test_fortify/read_overflow2-memmove.c
warning: unsafe memcpy() usage lacked '__write_overflow' symbol in lib/test_fortify/write_overflow-memcpy.c
...

Sort this out by also defining __SANITIZE_ADDRESS__ in GCC under the
hwaddress sanitizer.

Suggested-by: Arnd Bergmann <arnd@arndb.de>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Will Deacon <will@kernel.org>
Cc: Arvind Sankar <nivedita@alum.mit.edu>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: llvm@lists.linux.dev
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Acked-by: Miguel Ojeda <ojeda@kernel.org>
Reviewed-by: Marco Elver <elver@google.com>
Link: https://lore.kernel.org/r/20211020200039.170424-1-keescook@chromium.org
---
 include/linux/compiler-gcc.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 9957085b8148..7bbd8df02532 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -121,6 +121,14 @@
 #define __no_sanitize_coverage
 #endif
 
+/*
+ * Treat __SANITIZE_HWADDRESS__ the same as __SANITIZE_ADDRESS__ in the kernel,
+ * matching the defines used by Clang.
+ */
+#ifdef __SANITIZE_HWADDRESS__
+#define __SANITIZE_ADDRESS__
+#endif
+
 /*
  * Turn individual warnings and errors on and off locally, depending
  * on version.
-- 
cgit v1.2.3


From 3598b30bd970bbc09dba0cf31aa0a04105f37207 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 20 Oct 2021 21:08:06 +0200
Subject: cpufreq: Fix typo in cpufreq.h

s/internale/internal/

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 include/linux/cpufreq.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index ff88bb3e44fc..3317887027b5 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -385,7 +385,7 @@ struct cpufreq_driver {
 /* flags */
 
 /*
- * Set by drivers that need to update internale upper and lower boundaries along
+ * Set by drivers that need to update internal upper and lower boundaries along
  * with the target frequency and so the core and governors should also invoke
  * the diver if the target frequency does not change, but the policy min or max
  * may have changed.
-- 
cgit v1.2.3


From e279317e9aeb11d8670e0a5acb10d50566eea9c9 Mon Sep 17 00:00:00 2001
From: Arnaud Pouliquen <arnaud.pouliquen@foss.st.com>
Date: Fri, 15 Oct 2021 11:47:00 +0200
Subject: rpmsg: core: add API to get MTU

Return the rpmsg buffer MTU for sending message, so rpmsg users
can split a long message in several sub rpmsg buffers.

Reviewed-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Reviewed-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Acked-by: Suman Anna <s-anna@ti.com>
Signed-off-by: Arnaud Pouliquen <arnaud.pouliquen@foss.st.com>
Link: https://lore.kernel.org/r/20211015094701.5732-2-arnaud.pouliquen@foss.st.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/rpmsg.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rpmsg.h b/include/linux/rpmsg.h
index d97dcd049f18..990b80fb49ad 100644
--- a/include/linux/rpmsg.h
+++ b/include/linux/rpmsg.h
@@ -186,6 +186,8 @@ int rpmsg_trysend_offchannel(struct rpmsg_endpoint *ept, u32 src, u32 dst,
 __poll_t rpmsg_poll(struct rpmsg_endpoint *ept, struct file *filp,
 			poll_table *wait);
 
+ssize_t rpmsg_get_mtu(struct rpmsg_endpoint *ept);
+
 #else
 
 static inline int rpmsg_register_device(struct rpmsg_device *rpdev)
@@ -296,6 +298,14 @@ static inline __poll_t rpmsg_poll(struct rpmsg_endpoint *ept,
 	return 0;
 }
 
+static inline ssize_t rpmsg_get_mtu(struct rpmsg_endpoint *ept)
+{
+	/* This shouldn't be possible */
+	WARN_ON(1);
+
+	return -ENXIO;
+}
+
 #endif /* IS_ENABLED(CONFIG_RPMSG) */
 
 /* use a macro to avoid include chaining to get THIS_MODULE */
-- 
cgit v1.2.3


From 8ac33b8b6841e99a624ace543d92cbf598a91381 Mon Sep 17 00:00:00 2001
From: William Breathitt Gray <vilhelm.gray@gmail.com>
Date: Thu, 21 Oct 2021 19:35:40 +0900
Subject: counter: Fix use-after-free race condition for events_queue_size
 write

A race condition is possible when writing to events_queue_size where the
events kfifo is freed during the execution of a kfifo_in(), resulting in
a use-after-free. This patch prevents such a scenario by protecting the
events queue in operation with a spinlock and locking before performing
the events queue size adjustment.

The existing events_lock mutex is renamed to events_out_lock to reflect
that it only protects events queue out operations. Because the events
queue in operations can occur in an interrupt context, a new
events_in_lock spinlock is introduced and utilized.

Fixes: feff17a550c7 ("counter: Implement events_queue_size sysfs attribute")
Cc: David Lechner <david@lechnology.com>
Signed-off-by: William Breathitt Gray <vilhelm.gray@gmail.com>
Link: https://lore.kernel.org/r/20211021103540.955639-1-vilhelm.gray@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/counter.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/counter.h b/include/linux/counter.h
index 0fd99e255a50..b7d0a00a61cf 100644
--- a/include/linux/counter.h
+++ b/include/linux/counter.h
@@ -296,7 +296,8 @@ struct counter_ops {
  * @n_events_list_lock:	lock to protect Counter next events list operations
  * @events:		queue of detected Counter events
  * @events_wait:	wait queue to allow blocking reads of Counter events
- * @events_lock:	lock to protect Counter events queue read operations
+ * @events_in_lock:	lock to protect Counter events queue in operations
+ * @events_out_lock:	lock to protect Counter events queue out operations
  * @ops_exist_lock:	lock to prevent use during removal
  */
 struct counter_device {
@@ -323,7 +324,8 @@ struct counter_device {
 	struct mutex n_events_list_lock;
 	DECLARE_KFIFO_PTR(events, struct counter_event);
 	wait_queue_head_t events_wait;
-	struct mutex events_lock;
+	spinlock_t events_in_lock;
+	struct mutex events_out_lock;
 	struct mutex ops_exist_lock;
 };
 
-- 
cgit v1.2.3


From 992e5cc7be8e693aba1f0ee7905d34c87fd7872a Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 20 Oct 2021 20:49:55 +0300
Subject: net: dsa: tag_8021q: make dsa_8021q_{rx,tx}_vid take dp as argument

Pass a single argument to dsa_8021q_rx_vid and dsa_8021q_tx_vid that
contains the necessary information from the two arguments that are
currently provided: the switch and the port number.

Also rename those functions so that they have a dsa_port_* prefix, since
they operate on a struct dsa_port *.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dsa/8021q.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h
index c7fa4a3498fe..254b165f2b44 100644
--- a/include/linux/dsa/8021q.h
+++ b/include/linux/dsa/8021q.h
@@ -9,6 +9,7 @@
 #include <linux/types.h>
 
 struct dsa_switch;
+struct dsa_port;
 struct sk_buff;
 struct net_device;
 
@@ -45,9 +46,9 @@ void dsa_tag_8021q_bridge_tx_fwd_unoffload(struct dsa_switch *ds, int port,
 
 u16 dsa_8021q_bridge_tx_fwd_offload_vid(int bridge_num);
 
-u16 dsa_8021q_tx_vid(struct dsa_switch *ds, int port);
+u16 dsa_tag_8021q_tx_vid(const struct dsa_port *dp);
 
-u16 dsa_8021q_rx_vid(struct dsa_switch *ds, int port);
+u16 dsa_tag_8021q_rx_vid(const struct dsa_port *dp);
 
 int dsa_8021q_rx_switch_id(u16 vid);
 
-- 
cgit v1.2.3


From 061514dbfb79910ef60eb40dd9fc528be3f45d62 Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan@kernel.org>
Date: Mon, 18 Oct 2021 17:43:35 -0700
Subject: regulator: lp872x: Remove lp872x_dvs_state

After this driver was converted to gpiod, clang started warning:

vers/regulator/lp872x.c:689:57: error: implicit conversion from
enumeration type 'enum lp872x_dvs_state' to different enumeration type
'enum gpiod_flags' [-Werror,-Wenum-conversion]
        dvs->gpio = devm_gpiod_get_optional(lp->dev, "ti,dvs", pinstate);
                    ~~~~~~~~~~~~~~~~~~~~~~~                    ^~~~~~~~
1 error generated.

lp872x_dvs_state was updated to have values from gpiod_flags but this is
not enough to avoid an implicit conversion warning from either GCC or
clang (although GCC enables this warning under -Wextra instead of -Wall
like clang so it is not seen under normal builds).

Eliminate lp872x_dvs_state in favor of using gpiod_flags everywhere so
that there is no more warning about an implicit conversion.

Fixes: 72bf80cf09c4 ("regulator: lp872x: replacing legacy gpio interface for gpiod")
Link: https://github.com/ClangBuiltLinux/linux/issues/1481
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Link: https://lore.kernel.org/r/20211019004335.193492-1-nathan@kernel.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regulator/lp872x.h | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/regulator/lp872x.h b/include/linux/regulator/lp872x.h
index 8e7e0343c6e1..b62e45aa1dd3 100644
--- a/include/linux/regulator/lp872x.h
+++ b/include/linux/regulator/lp872x.h
@@ -40,11 +40,6 @@ enum lp872x_regulator_id {
 	LP872X_ID_MAX,
 };
 
-enum lp872x_dvs_state {
-	DVS_LOW  = GPIOD_OUT_LOW,
-	DVS_HIGH = GPIOD_OUT_HIGH,
-};
-
 enum lp872x_dvs_sel {
 	SEL_V1,
 	SEL_V2,
@@ -59,7 +54,7 @@ enum lp872x_dvs_sel {
 struct lp872x_dvs {
 	struct gpio_desc *gpio;
 	enum lp872x_dvs_sel vsel;
-	enum lp872x_dvs_state init_state;
+	enum gpiod_flags init_state;
 };
 
 /**
-- 
cgit v1.2.3


From 6a8b5bb0f1350fc4cf398435a1119db12b0bd50e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ma=C3=ADra=20Canal?= <maira.canal@usp.br>
Date: Sun, 17 Oct 2021 15:06:39 -0300
Subject: regulator: tps62360: replacing legacy gpio interface for gpiod
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Removing all linux/gpio.h and linux/of_gpio.h dependencies and replacing
them with the gpiod interface.

Signed-off-by: Maíra Canal <maira.canal@usp.br>
Link: https://lore.kernel.org/r/YWxmL2baF5AdzyHv@fedora
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regulator/tps62360.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/regulator/tps62360.h b/include/linux/regulator/tps62360.h
index 94a90c06f1e5..398e74a1d941 100644
--- a/include/linux/regulator/tps62360.h
+++ b/include/linux/regulator/tps62360.h
@@ -19,10 +19,6 @@
  * @en_discharge: Enable discharge the output capacitor via internal
  *                register.
  * @en_internal_pulldn: internal pull down enable or not.
- * @vsel0_gpio: Gpio number for vsel0. It should be -1 if this is tied with
- *              fixed logic.
- * @vsel1_gpio: Gpio number for vsel1. It should be -1 if this is tied with
- *              fixed logic.
  * @vsel0_def_state: Default state of vsel0. 1 if it is high else 0.
  * @vsel1_def_state: Default state of vsel1. 1 if it is high else 0.
  */
@@ -30,8 +26,6 @@ struct tps62360_regulator_platform_data {
 	struct regulator_init_data *reg_init_data;
 	bool en_discharge;
 	bool en_internal_pulldn;
-	int vsel0_gpio;
-	int vsel1_gpio;
 	int vsel0_def_state;
 	int vsel1_def_state;
 };
-- 
cgit v1.2.3


From 4570ddda43387e5a130dd85e71a1947b0c11da77 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Fri, 12 Mar 2021 14:04:07 +0100
Subject: powercap/drivers/dtpm: Encapsulate even more the code

In order to increase the self-encapsulation of the dtpm generic code,
the following changes are adding a power update ops to the dtpm
ops. That allows the generic code to call directly the dtpm backend
function to update the power values.

The power update function does compute the power characteristics when
the function is invoked. In the case of the CPUs, the power
consumption depends on the number of online CPUs. The online CPUs mask
is not up to date at CPUHP_AP_ONLINE_DYN state in the tear down
callback. That is the reason why the online / offline are at separate
state. As there is already an existing state for DTPM, this one is
only moved to the DEAD state, so there is no addition of new state
with these changes. The dtpm node is not removed when the cpu is
unplugged.

That simplifies the code for the next changes and results in a more
self-encapsulated code.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Link: https://lore.kernel.org/r/20210312130411.29833-1-daniel.lezcano@linaro.org
---
 include/linux/cpuhotplug.h | 2 +-
 include/linux/dtpm.h       | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 832d8a74fa59..ad9a34a80440 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -97,6 +97,7 @@ enum cpuhp_state {
 	CPUHP_LUSTRE_CFS_DEAD,
 	CPUHP_AP_ARM_CACHE_B15_RAC_DEAD,
 	CPUHP_PADATA_DEAD,
+	CPUHP_AP_DTPM_CPU_DEAD,
 	CPUHP_WORKQUEUE_PREP,
 	CPUHP_POWER_NUMA_PREPARE,
 	CPUHP_HRTIMERS_PREPARE,
@@ -242,7 +243,6 @@ enum cpuhp_state {
 	CPUHP_AP_ONLINE_DYN_END		= CPUHP_AP_ONLINE_DYN + 30,
 	CPUHP_AP_X86_HPET_ONLINE,
 	CPUHP_AP_X86_KVM_CLK_ONLINE,
-	CPUHP_AP_DTPM_CPU_ONLINE,
 	CPUHP_AP_ACTIVE,
 	CPUHP_ONLINE,
 };
diff --git a/include/linux/dtpm.h b/include/linux/dtpm.h
index e80a332e3d8a..acf8d3638988 100644
--- a/include/linux/dtpm.h
+++ b/include/linux/dtpm.h
@@ -29,6 +29,7 @@ struct dtpm {
 struct dtpm_ops {
 	u64 (*set_power_uw)(struct dtpm *, u64);
 	u64 (*get_power_uw)(struct dtpm *);
+	int (*update_power_uw)(struct dtpm *);
 	void (*release)(struct dtpm *);
 };
 
@@ -62,7 +63,7 @@ static inline struct dtpm *to_dtpm(struct powercap_zone *zone)
 	return container_of(zone, struct dtpm, zone);
 }
 
-int dtpm_update_power(struct dtpm *dtpm, u64 power_min, u64 power_max);
+int dtpm_update_power(struct dtpm *dtpm);
 
 int dtpm_release_zone(struct powercap_zone *pcz);
 
-- 
cgit v1.2.3


From 7a89d7eacf8e84f2afb94db5ae9d9f9faa93f01c Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Fri, 12 Mar 2021 14:04:09 +0100
Subject: powercap/drivers/dtpm: Simplify the dtpm table

The dtpm table is an array of pointers, that forces the user of the
table to define initdata along with the declaration of the table
entry. It is more efficient to create an array of dtpm structure, so
the declaration of the table entry can be done by initializing the
different fields.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Link: https://lore.kernel.org/r/20210312130411.29833-3-daniel.lezcano@linaro.org
---
 include/linux/dtpm.h | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dtpm.h b/include/linux/dtpm.h
index acf8d3638988..1e53db6bd5f9 100644
--- a/include/linux/dtpm.h
+++ b/include/linux/dtpm.h
@@ -33,25 +33,23 @@ struct dtpm_ops {
 	void (*release)(struct dtpm *);
 };
 
-struct dtpm_descr;
-
-typedef int (*dtpm_init_t)(struct dtpm_descr *);
+typedef int (*dtpm_init_t)(void);
 
 struct dtpm_descr {
-	struct dtpm *parent;
-	const char *name;
 	dtpm_init_t init;
 };
 
 /* Init section thermal table */
-extern struct dtpm_descr *__dtpm_table[];
-extern struct dtpm_descr *__dtpm_table_end[];
+extern struct dtpm_descr __dtpm_table[];
+extern struct dtpm_descr __dtpm_table_end[];
 
-#define DTPM_TABLE_ENTRY(name)			\
-	static typeof(name) *__dtpm_table_entry_##name	\
-	__used __section("__dtpm_table") = &name
+#define DTPM_TABLE_ENTRY(name, __init)				\
+	static struct dtpm_descr __dtpm_table_entry_##name	\
+	__used __section("__dtpm_table") = {			\
+		.init = __init,					\
+	}
 
-#define DTPM_DECLARE(name)	DTPM_TABLE_ENTRY(name)
+#define DTPM_DECLARE(name, init)	DTPM_TABLE_ENTRY(name, init)
 
 #define for_each_dtpm_table(__dtpm)	\
 	for (__dtpm = __dtpm_table;	\
-- 
cgit v1.2.3


From d2cdc6adc30879d81160199fc7c6ab890fc4bd4c Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Fri, 12 Mar 2021 14:04:10 +0100
Subject: powercap/drivers/dtpm: Use container_of instead of a private data
 field

The dtpm framework provides an API to allocate a dtpm node. However
when a backend dtpm driver needs to allocate a dtpm node it must
define its own structure and store the pointer of this structure in
the private field of the dtpm structure.

It is more elegant to use the container_of macro and add the dtpm
structure inside the dtpm backend specific structure. The code will be
able to deal properly with the dtpm structure as a generic entity,
making all this even more self-encapsulated.

The dtpm_alloc() function does no longer make sense as the dtpm
structure will be allocated when allocating the device specific dtpm
structure. The dtpm_init() is provided instead.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Link: https://lore.kernel.org/r/20210312130411.29833-4-daniel.lezcano@linaro.org
---
 include/linux/dtpm.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dtpm.h b/include/linux/dtpm.h
index 1e53db6bd5f9..2890f6370eb9 100644
--- a/include/linux/dtpm.h
+++ b/include/linux/dtpm.h
@@ -23,7 +23,6 @@ struct dtpm {
 	u64 power_max;
 	u64 power_min;
 	int weight;
-	void *private;
 };
 
 struct dtpm_ops {
@@ -65,7 +64,7 @@ int dtpm_update_power(struct dtpm *dtpm);
 
 int dtpm_release_zone(struct powercap_zone *pcz);
 
-struct dtpm *dtpm_alloc(struct dtpm_ops *ops);
+void dtpm_init(struct dtpm *dtpm, struct dtpm_ops *ops);
 
 void dtpm_unregister(struct dtpm *dtpm);
 
-- 
cgit v1.2.3


From 3b13c168186c115501ee7d194460ba2f8c825155 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 21 Oct 2021 14:30:51 +0100
Subject: percpu_ref: percpu_ref_tryget_live() version holding RCU

Add percpu_ref_tryget_live_rcu(), which is a version of
percpu_ref_tryget_live() but the user is responsible for enclosing it in
a RCU read lock section.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Acked-by: Dennis Zhou <dennis@kernel.org>
Link: https://lore.kernel.org/r/3066500d7a6eb3e03f10adf98b87fdb3b1c49db8.1634822969.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/percpu-refcount.h | 33 +++++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index ae16a9856305..b31d3f3312ce 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -266,6 +266,28 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref)
 	return percpu_ref_tryget_many(ref, 1);
 }
 
+/**
+ * percpu_ref_tryget_live_rcu - same as percpu_ref_tryget_live() but the
+ * caller is responsible for taking RCU.
+ *
+ * This function is safe to call as long as @ref is between init and exit.
+ */
+static inline bool percpu_ref_tryget_live_rcu(struct percpu_ref *ref)
+{
+	unsigned long __percpu *percpu_count;
+	bool ret = false;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	if (likely(__ref_is_percpu(ref, &percpu_count))) {
+		this_cpu_inc(*percpu_count);
+		ret = true;
+	} else if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) {
+		ret = atomic_long_inc_not_zero(&ref->data->count);
+	}
+	return ret;
+}
+
 /**
  * percpu_ref_tryget_live - try to increment a live percpu refcount
  * @ref: percpu_ref to try-get
@@ -283,20 +305,11 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref)
  */
 static inline bool percpu_ref_tryget_live(struct percpu_ref *ref)
 {
-	unsigned long __percpu *percpu_count;
 	bool ret = false;
 
 	rcu_read_lock();
-
-	if (__ref_is_percpu(ref, &percpu_count)) {
-		this_cpu_inc(*percpu_count);
-		ret = true;
-	} else if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) {
-		ret = atomic_long_inc_not_zero(&ref->data->count);
-	}
-
+	ret = percpu_ref_tryget_live_rcu(ref);
 	rcu_read_unlock();
-
 	return ret;
 }
 
-- 
cgit v1.2.3


From f059a1d2e23a165bf86e33673c6a7535a08c6341 Mon Sep 17 00:00:00 2001
From: Xie Yongji <xieyongji@bytedance.com>
Date: Wed, 22 Sep 2021 20:37:08 +0800
Subject: block: Add invalidate_disk() helper to invalidate the gendisk

To hide internal implementation and simplify some driver code,
this adds a helper to invalidate the gendisk. It will clean the
gendisk's associated buffer/page caches and reset its internal
states.

Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20210922123711.187-2-xieyongji@bytedance.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/genhd.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index c70bc5fce4db..13f313ab99e7 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -213,6 +213,8 @@ static inline int add_disk(struct gendisk *disk)
 }
 extern void del_gendisk(struct gendisk *gp);
 
+void invalidate_disk(struct gendisk *disk);
+
 void set_disk_ro(struct gendisk *disk, bool read_only);
 
 static inline int get_disk_ro(struct gendisk *disk)
-- 
cgit v1.2.3


From 1e8d44bddf57f6d878e083f281a34d5c88feb7db Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Mon, 18 Oct 2021 11:04:51 -0700
Subject: blk-crypto: rename keyslot-manager files to blk-crypto-profile

In preparation for renaming struct blk_keyslot_manager to struct
blk_crypto_profile, rename the keyslot-manager.h and keyslot-manager.c
source files.  Renaming these files separately before making a lot of
changes to their contents makes it easier for git to understand that
they were renamed.

Acked-by: Ulf Hansson <ulf.hansson@linaro.org> # For MMC
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Mike Snitzer <snitzer@redhat.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Link: https://lore.kernel.org/r/20211018180453.40441-3-ebiggers@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-crypto-profile.h | 120 +++++++++++++++++++++++++++++++++++++
 include/linux/keyslot-manager.h    | 120 -------------------------------------
 include/linux/mmc/host.h           |   2 +-
 3 files changed, 121 insertions(+), 121 deletions(-)
 create mode 100644 include/linux/blk-crypto-profile.h
 delete mode 100644 include/linux/keyslot-manager.h

(limited to 'include/linux')

diff --git a/include/linux/blk-crypto-profile.h b/include/linux/blk-crypto-profile.h
new file mode 100644
index 000000000000..a27605e2f826
--- /dev/null
+++ b/include/linux/blk-crypto-profile.h
@@ -0,0 +1,120 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2019 Google LLC
+ */
+
+#ifndef __LINUX_KEYSLOT_MANAGER_H
+#define __LINUX_KEYSLOT_MANAGER_H
+
+#include <linux/bio.h>
+#include <linux/blk-crypto.h>
+
+struct blk_keyslot_manager;
+
+/**
+ * struct blk_ksm_ll_ops - functions to manage keyslots in hardware
+ * @keyslot_program:	Program the specified key into the specified slot in the
+ *			inline encryption hardware.
+ * @keyslot_evict:	Evict key from the specified keyslot in the hardware.
+ *			The key is provided so that e.g. dm layers can evict
+ *			keys from the devices that they map over.
+ *			Returns 0 on success, -errno otherwise.
+ *
+ * This structure should be provided by storage device drivers when they set up
+ * a keyslot manager - this structure holds the function ptrs that the keyslot
+ * manager will use to manipulate keyslots in the hardware.
+ */
+struct blk_ksm_ll_ops {
+	int (*keyslot_program)(struct blk_keyslot_manager *ksm,
+			       const struct blk_crypto_key *key,
+			       unsigned int slot);
+	int (*keyslot_evict)(struct blk_keyslot_manager *ksm,
+			     const struct blk_crypto_key *key,
+			     unsigned int slot);
+};
+
+struct blk_keyslot_manager {
+	/*
+	 * The struct blk_ksm_ll_ops that this keyslot manager will use
+	 * to perform operations like programming and evicting keys on the
+	 * device
+	 */
+	struct blk_ksm_ll_ops ksm_ll_ops;
+
+	/*
+	 * The maximum number of bytes supported for specifying the data unit
+	 * number.
+	 */
+	unsigned int max_dun_bytes_supported;
+
+	/*
+	 * Array of size BLK_ENCRYPTION_MODE_MAX of bitmasks that represents
+	 * whether a crypto mode and data unit size are supported. The i'th
+	 * bit of crypto_mode_supported[crypto_mode] is set iff a data unit
+	 * size of (1 << i) is supported. We only support data unit sizes
+	 * that are powers of 2.
+	 */
+	unsigned int crypto_modes_supported[BLK_ENCRYPTION_MODE_MAX];
+
+	/* Device for runtime power management (NULL if none) */
+	struct device *dev;
+
+	/* Here onwards are *private* fields for internal keyslot manager use */
+
+	unsigned int num_slots;
+
+	/* Protects programming and evicting keys from the device */
+	struct rw_semaphore lock;
+
+	/* List of idle slots, with least recently used slot at front */
+	wait_queue_head_t idle_slots_wait_queue;
+	struct list_head idle_slots;
+	spinlock_t idle_slots_lock;
+
+	/*
+	 * Hash table which maps struct *blk_crypto_key to keyslots, so that we
+	 * can find a key's keyslot in O(1) time rather than O(num_slots).
+	 * Protected by 'lock'.
+	 */
+	struct hlist_head *slot_hashtable;
+	unsigned int log_slot_ht_size;
+
+	/* Per-keyslot data */
+	struct blk_ksm_keyslot *slots;
+};
+
+int blk_ksm_init(struct blk_keyslot_manager *ksm, unsigned int num_slots);
+
+int devm_blk_ksm_init(struct device *dev, struct blk_keyslot_manager *ksm,
+		      unsigned int num_slots);
+
+blk_status_t blk_ksm_get_slot_for_key(struct blk_keyslot_manager *ksm,
+				      const struct blk_crypto_key *key,
+				      struct blk_ksm_keyslot **slot_ptr);
+
+unsigned int blk_ksm_get_slot_idx(struct blk_ksm_keyslot *slot);
+
+void blk_ksm_put_slot(struct blk_ksm_keyslot *slot);
+
+bool blk_ksm_crypto_cfg_supported(struct blk_keyslot_manager *ksm,
+				  const struct blk_crypto_config *cfg);
+
+int blk_ksm_evict_key(struct blk_keyslot_manager *ksm,
+		      const struct blk_crypto_key *key);
+
+void blk_ksm_reprogram_all_keys(struct blk_keyslot_manager *ksm);
+
+void blk_ksm_destroy(struct blk_keyslot_manager *ksm);
+
+void blk_ksm_intersect_modes(struct blk_keyslot_manager *parent,
+			     const struct blk_keyslot_manager *child);
+
+void blk_ksm_init_passthrough(struct blk_keyslot_manager *ksm);
+
+bool blk_ksm_is_superset(struct blk_keyslot_manager *ksm_superset,
+			 struct blk_keyslot_manager *ksm_subset);
+
+void blk_ksm_update_capabilities(struct blk_keyslot_manager *target_ksm,
+				 struct blk_keyslot_manager *reference_ksm);
+
+#endif /* __LINUX_KEYSLOT_MANAGER_H */
diff --git a/include/linux/keyslot-manager.h b/include/linux/keyslot-manager.h
deleted file mode 100644
index a27605e2f826..000000000000
--- a/include/linux/keyslot-manager.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright 2019 Google LLC
- */
-
-#ifndef __LINUX_KEYSLOT_MANAGER_H
-#define __LINUX_KEYSLOT_MANAGER_H
-
-#include <linux/bio.h>
-#include <linux/blk-crypto.h>
-
-struct blk_keyslot_manager;
-
-/**
- * struct blk_ksm_ll_ops - functions to manage keyslots in hardware
- * @keyslot_program:	Program the specified key into the specified slot in the
- *			inline encryption hardware.
- * @keyslot_evict:	Evict key from the specified keyslot in the hardware.
- *			The key is provided so that e.g. dm layers can evict
- *			keys from the devices that they map over.
- *			Returns 0 on success, -errno otherwise.
- *
- * This structure should be provided by storage device drivers when they set up
- * a keyslot manager - this structure holds the function ptrs that the keyslot
- * manager will use to manipulate keyslots in the hardware.
- */
-struct blk_ksm_ll_ops {
-	int (*keyslot_program)(struct blk_keyslot_manager *ksm,
-			       const struct blk_crypto_key *key,
-			       unsigned int slot);
-	int (*keyslot_evict)(struct blk_keyslot_manager *ksm,
-			     const struct blk_crypto_key *key,
-			     unsigned int slot);
-};
-
-struct blk_keyslot_manager {
-	/*
-	 * The struct blk_ksm_ll_ops that this keyslot manager will use
-	 * to perform operations like programming and evicting keys on the
-	 * device
-	 */
-	struct blk_ksm_ll_ops ksm_ll_ops;
-
-	/*
-	 * The maximum number of bytes supported for specifying the data unit
-	 * number.
-	 */
-	unsigned int max_dun_bytes_supported;
-
-	/*
-	 * Array of size BLK_ENCRYPTION_MODE_MAX of bitmasks that represents
-	 * whether a crypto mode and data unit size are supported. The i'th
-	 * bit of crypto_mode_supported[crypto_mode] is set iff a data unit
-	 * size of (1 << i) is supported. We only support data unit sizes
-	 * that are powers of 2.
-	 */
-	unsigned int crypto_modes_supported[BLK_ENCRYPTION_MODE_MAX];
-
-	/* Device for runtime power management (NULL if none) */
-	struct device *dev;
-
-	/* Here onwards are *private* fields for internal keyslot manager use */
-
-	unsigned int num_slots;
-
-	/* Protects programming and evicting keys from the device */
-	struct rw_semaphore lock;
-
-	/* List of idle slots, with least recently used slot at front */
-	wait_queue_head_t idle_slots_wait_queue;
-	struct list_head idle_slots;
-	spinlock_t idle_slots_lock;
-
-	/*
-	 * Hash table which maps struct *blk_crypto_key to keyslots, so that we
-	 * can find a key's keyslot in O(1) time rather than O(num_slots).
-	 * Protected by 'lock'.
-	 */
-	struct hlist_head *slot_hashtable;
-	unsigned int log_slot_ht_size;
-
-	/* Per-keyslot data */
-	struct blk_ksm_keyslot *slots;
-};
-
-int blk_ksm_init(struct blk_keyslot_manager *ksm, unsigned int num_slots);
-
-int devm_blk_ksm_init(struct device *dev, struct blk_keyslot_manager *ksm,
-		      unsigned int num_slots);
-
-blk_status_t blk_ksm_get_slot_for_key(struct blk_keyslot_manager *ksm,
-				      const struct blk_crypto_key *key,
-				      struct blk_ksm_keyslot **slot_ptr);
-
-unsigned int blk_ksm_get_slot_idx(struct blk_ksm_keyslot *slot);
-
-void blk_ksm_put_slot(struct blk_ksm_keyslot *slot);
-
-bool blk_ksm_crypto_cfg_supported(struct blk_keyslot_manager *ksm,
-				  const struct blk_crypto_config *cfg);
-
-int blk_ksm_evict_key(struct blk_keyslot_manager *ksm,
-		      const struct blk_crypto_key *key);
-
-void blk_ksm_reprogram_all_keys(struct blk_keyslot_manager *ksm);
-
-void blk_ksm_destroy(struct blk_keyslot_manager *ksm);
-
-void blk_ksm_intersect_modes(struct blk_keyslot_manager *parent,
-			     const struct blk_keyslot_manager *child);
-
-void blk_ksm_init_passthrough(struct blk_keyslot_manager *ksm);
-
-bool blk_ksm_is_superset(struct blk_keyslot_manager *ksm_superset,
-			 struct blk_keyslot_manager *ksm_subset);
-
-void blk_ksm_update_capabilities(struct blk_keyslot_manager *target_ksm,
-				 struct blk_keyslot_manager *reference_ksm);
-
-#endif /* __LINUX_KEYSLOT_MANAGER_H */
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 0c0c9a0fdf57..725b1de41767 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -15,7 +15,7 @@
 #include <linux/mmc/card.h>
 #include <linux/mmc/pm.h>
 #include <linux/dma-direction.h>
-#include <linux/keyslot-manager.h>
+#include <linux/blk-crypto-profile.h>
 
 struct mmc_ios {
 	unsigned int	clock;			/* clock rate */
-- 
cgit v1.2.3


From cb77cb5abe1f4fae4a33b735606aae22f9eaa1c7 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Mon, 18 Oct 2021 11:04:52 -0700
Subject: blk-crypto: rename blk_keyslot_manager to blk_crypto_profile

blk_keyslot_manager is misnamed because it doesn't necessarily manage
keyslots.  It actually does several different things:

  - Contains the crypto capabilities of the device.

  - Provides functions to control the inline encryption hardware.
    Originally these were just for programming/evicting keyslots;
    however, new functionality (hardware-wrapped keys) will require new
    functions here which are unrelated to keyslots.  Moreover,
    device-mapper devices already (ab)use "keyslot_evict" to pass key
    eviction requests to their underlying devices even though
    device-mapper devices don't have any keyslots themselves (so it
    really should be "evict_key", not "keyslot_evict").

  - Sometimes (but not always!) it manages keyslots.  Originally it
    always did, but device-mapper devices don't have keyslots
    themselves, so they use a "passthrough keyslot manager" which
    doesn't actually manage keyslots.  This hack works, but the
    terminology is unnatural.  Also, some hardware doesn't have keyslots
    and thus also uses a "passthrough keyslot manager" (support for such
    hardware is yet to be upstreamed, but it will happen eventually).

Let's stop having keyslot managers which don't actually manage keyslots.
Instead, rename blk_keyslot_manager to blk_crypto_profile.

This is a fairly big change, since for consistency it also has to update
keyslot manager-related function names, variable names, and comments --
not just the actual struct name.  However it's still a fairly
straightforward change, as it doesn't change any actual functionality.

Acked-by: Ulf Hansson <ulf.hansson@linaro.org> # For MMC
Reviewed-by: Mike Snitzer <snitzer@redhat.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Link: https://lore.kernel.org/r/20211018180453.40441-4-ebiggers@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-crypto-profile.h | 164 ++++++++++++++++++++++++-------------
 include/linux/blk-mq.h             |   2 +-
 include/linux/blkdev.h             |  16 ++--
 include/linux/device-mapper.h      |   4 +-
 include/linux/mmc/host.h           |   2 +-
 5 files changed, 117 insertions(+), 71 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-crypto-profile.h b/include/linux/blk-crypto-profile.h
index a27605e2f826..bbab65bd5428 100644
--- a/include/linux/blk-crypto-profile.h
+++ b/include/linux/blk-crypto-profile.h
@@ -3,67 +3,113 @@
  * Copyright 2019 Google LLC
  */
 
-#ifndef __LINUX_KEYSLOT_MANAGER_H
-#define __LINUX_KEYSLOT_MANAGER_H
+#ifndef __LINUX_BLK_CRYPTO_PROFILE_H
+#define __LINUX_BLK_CRYPTO_PROFILE_H
 
 #include <linux/bio.h>
 #include <linux/blk-crypto.h>
 
-struct blk_keyslot_manager;
+struct blk_crypto_profile;
 
 /**
- * struct blk_ksm_ll_ops - functions to manage keyslots in hardware
- * @keyslot_program:	Program the specified key into the specified slot in the
- *			inline encryption hardware.
- * @keyslot_evict:	Evict key from the specified keyslot in the hardware.
- *			The key is provided so that e.g. dm layers can evict
- *			keys from the devices that they map over.
- *			Returns 0 on success, -errno otherwise.
+ * struct blk_crypto_ll_ops - functions to control inline encryption hardware
  *
- * This structure should be provided by storage device drivers when they set up
- * a keyslot manager - this structure holds the function ptrs that the keyslot
- * manager will use to manipulate keyslots in the hardware.
+ * Low-level operations for controlling inline encryption hardware.  This
+ * interface must be implemented by storage drivers that support inline
+ * encryption.  All functions may sleep, are serialized by profile->lock, and
+ * are never called while profile->dev (if set) is runtime-suspended.
  */
-struct blk_ksm_ll_ops {
-	int (*keyslot_program)(struct blk_keyslot_manager *ksm,
+struct blk_crypto_ll_ops {
+
+	/**
+	 * @keyslot_program: Program a key into the inline encryption hardware.
+	 *
+	 * Program @key into the specified @slot in the inline encryption
+	 * hardware, overwriting any key that the keyslot may already contain.
+	 * The keyslot is guaranteed to not be in-use by any I/O.
+	 *
+	 * This is required if the device has keyslots.  Otherwise (i.e. if the
+	 * device is a layered device, or if the device is real hardware that
+	 * simply doesn't have the concept of keyslots) it is never called.
+	 *
+	 * Must return 0 on success, or -errno on failure.
+	 */
+	int (*keyslot_program)(struct blk_crypto_profile *profile,
 			       const struct blk_crypto_key *key,
 			       unsigned int slot);
-	int (*keyslot_evict)(struct blk_keyslot_manager *ksm,
+
+	/**
+	 * @keyslot_evict: Evict a key from the inline encryption hardware.
+	 *
+	 * If the device has keyslots, this function must evict the key from the
+	 * specified @slot.  The slot will contain @key, but there should be no
+	 * need for the @key argument to be used as @slot should be sufficient.
+	 * The keyslot is guaranteed to not be in-use by any I/O.
+	 *
+	 * If the device doesn't have keyslots itself, this function must evict
+	 * @key from any underlying devices.  @slot won't be valid in this case.
+	 *
+	 * If there are no keyslots and no underlying devices, this function
+	 * isn't required.
+	 *
+	 * Must return 0 on success, or -errno on failure.
+	 */
+	int (*keyslot_evict)(struct blk_crypto_profile *profile,
 			     const struct blk_crypto_key *key,
 			     unsigned int slot);
 };
 
-struct blk_keyslot_manager {
-	/*
-	 * The struct blk_ksm_ll_ops that this keyslot manager will use
-	 * to perform operations like programming and evicting keys on the
-	 * device
+/**
+ * struct blk_crypto_profile - inline encryption profile for a device
+ *
+ * This struct contains a storage device's inline encryption capabilities (e.g.
+ * the supported crypto algorithms), driver-provided functions to control the
+ * inline encryption hardware (e.g. programming and evicting keys), and optional
+ * device-independent keyslot management data.
+ */
+struct blk_crypto_profile {
+
+	/* public: Drivers must initialize the following fields. */
+
+	/**
+	 * @ll_ops: Driver-provided functions to control the inline encryption
+	 * hardware, e.g. program and evict keys.
 	 */
-	struct blk_ksm_ll_ops ksm_ll_ops;
+	struct blk_crypto_ll_ops ll_ops;
 
-	/*
-	 * The maximum number of bytes supported for specifying the data unit
-	 * number.
+	/**
+	 * @max_dun_bytes_supported: The maximum number of bytes supported for
+	 * specifying the data unit number (DUN).  Specifically, the range of
+	 * supported DUNs is 0 through (1 << (8 * max_dun_bytes_supported)) - 1.
 	 */
 	unsigned int max_dun_bytes_supported;
 
-	/*
-	 * Array of size BLK_ENCRYPTION_MODE_MAX of bitmasks that represents
-	 * whether a crypto mode and data unit size are supported. The i'th
-	 * bit of crypto_mode_supported[crypto_mode] is set iff a data unit
-	 * size of (1 << i) is supported. We only support data unit sizes
-	 * that are powers of 2.
+	/**
+	 * @modes_supported: Array of bitmasks that specifies whether each
+	 * combination of crypto mode and data unit size is supported.
+	 * Specifically, the i'th bit of modes_supported[crypto_mode] is set if
+	 * crypto_mode can be used with a data unit size of (1 << i).  Note that
+	 * only data unit sizes that are powers of 2 can be supported.
 	 */
-	unsigned int crypto_modes_supported[BLK_ENCRYPTION_MODE_MAX];
+	unsigned int modes_supported[BLK_ENCRYPTION_MODE_MAX];
 
-	/* Device for runtime power management (NULL if none) */
+	/**
+	 * @dev: An optional device for runtime power management.  If the driver
+	 * provides this device, it will be runtime-resumed before any function
+	 * in @ll_ops is called and will remain resumed during the call.
+	 */
 	struct device *dev;
 
-	/* Here onwards are *private* fields for internal keyslot manager use */
+	/* private: The following fields shouldn't be accessed by drivers. */
 
+	/* Number of keyslots, or 0 if not applicable */
 	unsigned int num_slots;
 
-	/* Protects programming and evicting keys from the device */
+	/*
+	 * Serializes all calls to functions in @ll_ops as well as all changes
+	 * to @slot_hashtable.  This can also be taken in read mode to look up
+	 * keyslots while ensuring that they can't be changed concurrently.
+	 */
 	struct rw_semaphore lock;
 
 	/* List of idle slots, with least recently used slot at front */
@@ -80,41 +126,41 @@ struct blk_keyslot_manager {
 	unsigned int log_slot_ht_size;
 
 	/* Per-keyslot data */
-	struct blk_ksm_keyslot *slots;
+	struct blk_crypto_keyslot *slots;
 };
 
-int blk_ksm_init(struct blk_keyslot_manager *ksm, unsigned int num_slots);
-
-int devm_blk_ksm_init(struct device *dev, struct blk_keyslot_manager *ksm,
-		      unsigned int num_slots);
+int blk_crypto_profile_init(struct blk_crypto_profile *profile,
+			    unsigned int num_slots);
 
-blk_status_t blk_ksm_get_slot_for_key(struct blk_keyslot_manager *ksm,
-				      const struct blk_crypto_key *key,
-				      struct blk_ksm_keyslot **slot_ptr);
+int devm_blk_crypto_profile_init(struct device *dev,
+				 struct blk_crypto_profile *profile,
+				 unsigned int num_slots);
 
-unsigned int blk_ksm_get_slot_idx(struct blk_ksm_keyslot *slot);
+unsigned int blk_crypto_keyslot_index(struct blk_crypto_keyslot *slot);
 
-void blk_ksm_put_slot(struct blk_ksm_keyslot *slot);
+blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile,
+				    const struct blk_crypto_key *key,
+				    struct blk_crypto_keyslot **slot_ptr);
 
-bool blk_ksm_crypto_cfg_supported(struct blk_keyslot_manager *ksm,
-				  const struct blk_crypto_config *cfg);
+void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot);
 
-int blk_ksm_evict_key(struct blk_keyslot_manager *ksm,
-		      const struct blk_crypto_key *key);
+bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile,
+				const struct blk_crypto_config *cfg);
 
-void blk_ksm_reprogram_all_keys(struct blk_keyslot_manager *ksm);
+int __blk_crypto_evict_key(struct blk_crypto_profile *profile,
+			   const struct blk_crypto_key *key);
 
-void blk_ksm_destroy(struct blk_keyslot_manager *ksm);
+void blk_crypto_reprogram_all_keys(struct blk_crypto_profile *profile);
 
-void blk_ksm_intersect_modes(struct blk_keyslot_manager *parent,
-			     const struct blk_keyslot_manager *child);
+void blk_crypto_profile_destroy(struct blk_crypto_profile *profile);
 
-void blk_ksm_init_passthrough(struct blk_keyslot_manager *ksm);
+void blk_crypto_intersect_capabilities(struct blk_crypto_profile *parent,
+				       const struct blk_crypto_profile *child);
 
-bool blk_ksm_is_superset(struct blk_keyslot_manager *ksm_superset,
-			 struct blk_keyslot_manager *ksm_subset);
+bool blk_crypto_has_capabilities(const struct blk_crypto_profile *target,
+				 const struct blk_crypto_profile *reference);
 
-void blk_ksm_update_capabilities(struct blk_keyslot_manager *target_ksm,
-				 struct blk_keyslot_manager *reference_ksm);
+void blk_crypto_update_capabilities(struct blk_crypto_profile *dst,
+				    const struct blk_crypto_profile *src);
 
-#endif /* __LINUX_KEYSLOT_MANAGER_H */
+#endif /* __LINUX_BLK_CRYPTO_PROFILE_H */
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index e13780236550..b4039fdf1b04 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -133,7 +133,7 @@ struct request {
 
 #ifdef CONFIG_BLK_INLINE_ENCRYPTION
 	struct bio_crypt_ctx *crypt_ctx;
-	struct blk_ksm_keyslot *crypt_keyslot;
+	struct blk_crypto_keyslot *crypt_keyslot;
 #endif
 
 	unsigned short write_hint;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c7b1e9355123..f72ccb2829db 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -30,7 +30,7 @@ struct pr_ops;
 struct rq_qos;
 struct blk_queue_stats;
 struct blk_stat_callback;
-struct blk_keyslot_manager;
+struct blk_crypto_profile;
 
 /* Must be consistent with blk_mq_poll_stats_bkt() */
 #define BLK_MQ_POLL_STATS_BKTS 16
@@ -224,8 +224,7 @@ struct request_queue {
 	unsigned int		dma_alignment;
 
 #ifdef CONFIG_BLK_INLINE_ENCRYPTION
-	/* Inline crypto capabilities */
-	struct blk_keyslot_manager *ksm;
+	struct blk_crypto_profile *crypto_profile;
 #endif
 
 	unsigned int		rq_timeout;
@@ -1142,19 +1141,20 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned lo
 
 #ifdef CONFIG_BLK_INLINE_ENCRYPTION
 
-bool blk_ksm_register(struct blk_keyslot_manager *ksm, struct request_queue *q);
+bool blk_crypto_register(struct blk_crypto_profile *profile,
+			 struct request_queue *q);
 
-void blk_ksm_unregister(struct request_queue *q);
+void blk_crypto_unregister(struct request_queue *q);
 
 #else /* CONFIG_BLK_INLINE_ENCRYPTION */
 
-static inline bool blk_ksm_register(struct blk_keyslot_manager *ksm,
-				    struct request_queue *q)
+static inline bool blk_crypto_register(struct blk_crypto_profile *profile,
+				       struct request_queue *q)
 {
 	return true;
 }
 
-static inline void blk_ksm_unregister(struct request_queue *q) { }
+static inline void blk_crypto_unregister(struct request_queue *q) { }
 
 #endif /* CONFIG_BLK_INLINE_ENCRYPTION */
 
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 114553b487ef..a7df155ea49b 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -576,9 +576,9 @@ struct dm_table *dm_swap_table(struct mapped_device *md,
 			       struct dm_table *t);
 
 /*
- * Table keyslot manager functions
+ * Table blk_crypto_profile functions
  */
-void dm_destroy_keyslot_manager(struct blk_keyslot_manager *ksm);
+void dm_destroy_crypto_profile(struct blk_crypto_profile *profile);
 
 /*-----------------------------------------------------------------
  * Macros.
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 725b1de41767..52eae8c45b8d 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -492,7 +492,7 @@ struct mmc_host {
 
 	/* Inline encryption support */
 #ifdef CONFIG_MMC_CRYPTO
-	struct blk_keyslot_manager ksm;
+	struct blk_crypto_profile crypto_profile;
 #endif
 
 	/* Host Software Queue support */
-- 
cgit v1.2.3


From f64dd4627ec6edc39bf1430fe6dbc923d2300a88 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Fri, 8 Oct 2021 11:13:34 +0200
Subject: ftrace: Add multi direct register/unregister interface

Adding interface to register multiple direct functions
within single call. Adding following functions:

  register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
  unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)

The register_ftrace_direct_multi registers direct function (addr)
with all functions in ops filter. The ops filter can be updated
before with ftrace_set_filter_ip calls.

All requested functions must not have direct function currently
registered, otherwise register_ftrace_direct_multi will fail.

The unregister_ftrace_direct_multi unregisters ops related direct
functions.

Link: https://lkml.kernel.org/r/20211008091336.33616-7-jolsa@kernel.org

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/ftrace.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 16a7baaba702..0158261cac9f 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -324,7 +324,10 @@ int ftrace_modify_direct_caller(struct ftrace_func_entry *entry,
 				unsigned long old_addr,
 				unsigned long new_addr);
 unsigned long ftrace_find_rec_direct(unsigned long ip);
+int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr);
+int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr);
 #else
+struct ftrace_ops;
 # define ftrace_direct_func_count 0
 static inline int register_ftrace_direct(unsigned long ip, unsigned long addr)
 {
@@ -354,6 +357,14 @@ static inline unsigned long ftrace_find_rec_direct(unsigned long ip)
 {
 	return 0;
 }
+static inline int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
+{
+	return -ENODEV;
+}
+static inline int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
+{
+	return -ENODEV;
+}
 #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
 
 #ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
-- 
cgit v1.2.3


From ccf5a89efd6f0a9483cea8acd4a0822b1a47e59a Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Fri, 8 Oct 2021 11:13:35 +0200
Subject: ftrace: Add multi direct modify interface

Adding interface to modify registered direct function
for ftrace_ops. Adding following function:

   modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)

The function changes the currently registered direct
function for all attached functions.

Link: https://lkml.kernel.org/r/20211008091336.33616-8-jolsa@kernel.org

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/ftrace.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 0158261cac9f..9999e29187de 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -326,6 +326,8 @@ int ftrace_modify_direct_caller(struct ftrace_func_entry *entry,
 unsigned long ftrace_find_rec_direct(unsigned long ip);
 int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr);
 int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr);
+int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr);
+
 #else
 struct ftrace_ops;
 # define ftrace_direct_func_count 0
@@ -365,6 +367,10 @@ static inline int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigne
 {
 	return -ENODEV;
 }
+static inline int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
+{
+	return -ENODEV;
+}
 #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
 
 #ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
-- 
cgit v1.2.3


From bce5c81cb31f7f124ce231ec79df9e85a8bac132 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 19 Oct 2021 09:25:20 -0400
Subject: tracing: Explain the trace recursion transition bit better

The current text of the explanation of the transition bit in the trace
recursion protection is not very clear. Improve the text, so that when all
the archs no longer have the issue of tracing between a start of a new
(interrupt) context and updating the preempt_count to reflect the new
context, that it may be removed.

Link: https://lore.kernel.org/all/20211018220203.064a42ed@gandalf.local.home/

Suggested-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/trace_recursion.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h
index 1d8cce02c3fb..24f284eb55a7 100644
--- a/include/linux/trace_recursion.h
+++ b/include/linux/trace_recursion.h
@@ -168,8 +168,12 @@ static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsign
 	bit = trace_get_context_bit() + start;
 	if (unlikely(val & (1 << bit))) {
 		/*
-		 * It could be that preempt_count has not been updated during
-		 * a switch between contexts. Allow for a single recursion.
+		 * If an interrupt occurs during a trace, and another trace
+		 * happens in that interrupt but before the preempt_count is
+		 * updated to reflect the new interrupt context, then this
+		 * will think a recursion occurred, and the event will be dropped.
+		 * Let a single instance happen via the TRANSITION_BIT to
+		 * not drop those events.
 		 */
 		bit = TRACE_TRANSITION_BIT;
 		if (val & (1 << bit)) {
-- 
cgit v1.2.3


From 17b5b576ff5faff99a4c8140d521cd4d7fff5c16 Mon Sep 17 00:00:00 2001
From: Vincent Whitchurch <vincent.whitchurch@axis.com>
Date: Thu, 7 Oct 2021 15:46:39 +0200
Subject: mux: add support for delay after muxing

Hardware may require some time for the muxed analog signals to settle
after the muxing is changed.  Allow users of the mux subsystem to
specify this delay with the new mux_control_select_delay() function (and
the _try equivalent).

Signed-off-by: Vincent Whitchurch <vincent.whitchurch@axis.com>
Reviewed-by: Lars-Peter Clausen <lars@metafoo.de>
Tested-by: Lars-Peter Clausen <lars@metafoo.de>
Acked-by: Peter Rosin <peda@axentia.se>
Link: https://lore.kernel.org/r/20211007134641.13417-2-vincent.whitchurch@axis.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/mux/consumer.h | 23 +++++++++++++++++++----
 include/linux/mux/driver.h   |  4 ++++
 2 files changed, 23 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mux/consumer.h b/include/linux/mux/consumer.h
index 5fc6bb2fefad..7a09b040ac39 100644
--- a/include/linux/mux/consumer.h
+++ b/include/linux/mux/consumer.h
@@ -16,10 +16,25 @@ struct device;
 struct mux_control;
 
 unsigned int mux_control_states(struct mux_control *mux);
-int __must_check mux_control_select(struct mux_control *mux,
-				    unsigned int state);
-int __must_check mux_control_try_select(struct mux_control *mux,
-					unsigned int state);
+int __must_check mux_control_select_delay(struct mux_control *mux,
+					  unsigned int state,
+					  unsigned int delay_us);
+int __must_check mux_control_try_select_delay(struct mux_control *mux,
+					      unsigned int state,
+					      unsigned int delay_us);
+
+static inline int __must_check mux_control_select(struct mux_control *mux,
+						  unsigned int state)
+{
+	return mux_control_select_delay(mux, state, 0);
+}
+
+static inline int __must_check mux_control_try_select(struct mux_control *mux,
+						      unsigned int state)
+{
+	return mux_control_try_select_delay(mux, state, 0);
+}
+
 int mux_control_deselect(struct mux_control *mux);
 
 struct mux_control *mux_control_get(struct device *dev, const char *mux_name);
diff --git a/include/linux/mux/driver.h b/include/linux/mux/driver.h
index 627a2c6bc02d..18824064f8c0 100644
--- a/include/linux/mux/driver.h
+++ b/include/linux/mux/driver.h
@@ -12,6 +12,7 @@
 
 #include <dt-bindings/mux/mux.h>
 #include <linux/device.h>
+#include <linux/ktime.h>
 #include <linux/semaphore.h>
 
 struct mux_chip;
@@ -33,6 +34,7 @@ struct mux_control_ops {
  * @states:		The number of mux controller states.
  * @idle_state:		The mux controller state to use when inactive, or one
  *			of MUX_IDLE_AS_IS and MUX_IDLE_DISCONNECT.
+ * @last_change:	Timestamp of last change
  *
  * Mux drivers may only change @states and @idle_state, and may only do so
  * between allocation and registration of the mux controller. Specifically,
@@ -47,6 +49,8 @@ struct mux_control {
 
 	unsigned int states;
 	int idle_state;
+
+	ktime_t last_change;
 };
 
 /**
-- 
cgit v1.2.3


From 9eeb3aa33ae005526f672b394c1791578463513f Mon Sep 17 00:00:00 2001
From: Hengqi Chen <hengqi.chen@gmail.com>
Date: Thu, 21 Oct 2021 21:47:51 +0800
Subject: bpf: Add bpf_skc_to_unix_sock() helper

The helper is used in tracing programs to cast a socket
pointer to a unix_sock pointer.
The return value could be NULL if the casting is illegal.

Suggested-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Hengqi Chen <hengqi.chen@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20211021134752.1223426-2-hengqi.chen@gmail.com
---
 include/linux/bpf.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d604c8251d88..be3102b4554b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2093,6 +2093,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp_sock_proto;
 extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto;
 extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto;
 extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto;
+extern const struct bpf_func_proto bpf_skc_to_unix_sock_proto;
 extern const struct bpf_func_proto bpf_copy_from_user_proto;
 extern const struct bpf_func_proto bpf_snprintf_btf_proto;
 extern const struct bpf_func_proto bpf_snprintf_proto;
-- 
cgit v1.2.3


From d08fd747d0ed682b6c6c280ba454cafcad33563d Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Thu, 14 Oct 2021 16:35:11 +0200
Subject: Compiler Attributes: remove GCC 5.1 mention

GCC 5.1 is now the minimum version.

Acked-by: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 include/linux/compiler_attributes.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h
index e6ec63403965..87d1e773400c 100644
--- a/include/linux/compiler_attributes.h
+++ b/include/linux/compiler_attributes.h
@@ -104,7 +104,6 @@
 #define __deprecated
 
 /*
- * Optional: only supported since gcc >= 5.1
  * Optional: not supported by clang
  * Optional: not supported by icc
  *
-- 
cgit v1.2.3


From aba64c7da98330141dcdadd5612f088043a83696 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Wed, 20 Oct 2021 00:48:17 -0700
Subject: bpf: Add verified_insns to bpf_prog_info and fdinfo

This stat is currently printed in the verifier log and not stored
anywhere. To ease consumption of this data, add a field to bpf_prog_aux
so it can be exposed via BPF_OBJ_GET_INFO_BY_FD and fdinfo.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20211020074818.1017682-2-davemarchevsky@fb.com
---
 include/linux/bpf.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index be3102b4554b..31421c74ba08 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -887,6 +887,7 @@ struct bpf_prog_aux {
 	struct bpf_prog *prog;
 	struct user_struct *user;
 	u64 load_time; /* ns since boottime */
+	u32 verified_insns;
 	struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE];
 	char name[BPF_OBJ_NAME_LEN];
 #ifdef CONFIG_SECURITY
-- 
cgit v1.2.3


From 7c00621dcaeea206d7489b3e8b50b1864841ae69 Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Thu, 14 Oct 2021 15:23:31 +0200
Subject: compiler_types: mark __compiletime_assert failure as __noreturn

`__compiletime_assert` declares a fake `extern` function
which appears (to the compiler) to be called when the test fails.

Therefore, compilers may emit possibly-uninitialized warnings
in some cases, even if it will be an error anyway (for compilers
supporting the `error` attribute, e.g. GCC and Clang >= 14)
or a link failure (for those that do not, e.g. Clang < 14).

Annotating the fake function as `__noreturn` gives them
the information they need to avoid the warning,
e.g. see https://godbolt.org/z/x1v69jjYY.

Link: https://lore.kernel.org/llvm/202110100514.3h9CI4s0-lkp@intel.com/
Reported-by: kernel test robot <lkp@intel.com>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 include/linux/compiler_types.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index b6ff83a714ca..ca1a66b8cd2f 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -298,7 +298,13 @@ struct ftrace_likely_data {
 #ifdef __OPTIMIZE__
 # define __compiletime_assert(condition, msg, prefix, suffix)		\
 	do {								\
-		extern void prefix ## suffix(void) __compiletime_error(msg); \
+		/*							\
+		 * __noreturn is needed to give the compiler enough	\
+		 * information to avoid certain possibly-uninitialized	\
+		 * warnings (regardless of the build failing).		\
+		 */							\
+		__noreturn extern void prefix ## suffix(void)		\
+			__compiletime_error(msg);			\
 		if (!(condition))					\
 			prefix ## suffix();				\
 	} while (0)
-- 
cgit v1.2.3


From 008d3825a805557464c5e75f9eb806a3aa2f5e6d Mon Sep 17 00:00:00 2001
From: Eddie James <eajames@linux.ibm.com>
Date: Tue, 19 Oct 2021 15:53:04 -0500
Subject: fsi: occ: Use a large buffer for responses

Allocate a large buffer for each OCC to handle response data. This
removes memory allocation during an operation, and also allows for
the maximum amount of SBE FFDC.

Previously for the putsram and attn commands, only 32 words would have
been available, and for getsram, only up to the size of the transfer.
SBE FFDC might be up to 8Kb.

The SBE interface expects data to be specified in units of words (4
bytes), defined as OCC_MAX_RESP_WORDS.

This change allows the full FFDC capture to be implemented, where before
it was not available.

Signed-off-by: Eddie James <eajames@linux.ibm.com>
Link: https://lore.kernel.org/r/20211019205307.36946-2-eajames@linux.ibm.com
Signed-off-by: Joel Stanley <joel@jms.id.au>
---
 include/linux/fsi-occ.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fsi-occ.h b/include/linux/fsi-occ.h
index d4cdc2aa6e33..7ee3dbd7f4b3 100644
--- a/include/linux/fsi-occ.h
+++ b/include/linux/fsi-occ.h
@@ -19,6 +19,8 @@ struct device;
 #define OCC_RESP_CRIT_OCB		0xE3
 #define OCC_RESP_CRIT_HW		0xE4
 
+#define OCC_MAX_RESP_WORDS		2048
+
 int fsi_occ_submit(struct device *dev, const void *request, size_t req_len,
 		   void *response, size_t *resp_len);
 
-- 
cgit v1.2.3


From dc0fd0acb6e0e8025a0a43ada54513b216254fac Mon Sep 17 00:00:00 2001
From: Maximilian Luz <luzmaximilian@gmail.com>
Date: Thu, 21 Oct 2021 15:09:03 +0200
Subject: HID: surface-hid: Use correct event registry for managing HID events

Until now, we have only ever seen the REG-category registry being used
on devices addressed with target ID 2. In fact, we have only ever seen
Surface Aggregator Module (SAM) HID devices with target ID 2. For those
devices, the registry also has to be addressed with target ID 2.

Some devices, like the new Surface Laptop Studio, however, address their
HID devices on target ID 1. As a result of this, any target ID 2
commands time out. This includes event management commands addressed to
the target ID 2 REG-category registry. For these devices, the registry
has to be addressed via target ID 1 instead.

We therefore assume that the target ID of the registry to be used
depends on the target ID of the respective device. Implement this
accordingly.

Note that we currently allow the surface HID driver to only load against
devices with target ID 2, so these timeouts are not happening (yet).
This is just a preparation step before we allow the driver to load
against all target IDs.

Cc: stable@vger.kernel.org # 5.14+
Signed-off-by: Maximilian Luz <luzmaximilian@gmail.com>
Acked-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Link: https://lore.kernel.org/r/20211021130904.862610-3-luzmaximilian@gmail.com
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/surface_aggregator/controller.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/surface_aggregator/controller.h b/include/linux/surface_aggregator/controller.h
index 068e1982ad37..74bfdffaf7b0 100644
--- a/include/linux/surface_aggregator/controller.h
+++ b/include/linux/surface_aggregator/controller.h
@@ -792,8 +792,8 @@ enum ssam_event_mask {
 #define SSAM_EVENT_REGISTRY_KIP	\
 	SSAM_EVENT_REGISTRY(SSAM_SSH_TC_KIP, 0x02, 0x27, 0x28)
 
-#define SSAM_EVENT_REGISTRY_REG \
-	SSAM_EVENT_REGISTRY(SSAM_SSH_TC_REG, 0x02, 0x01, 0x02)
+#define SSAM_EVENT_REGISTRY_REG(tid)\
+	SSAM_EVENT_REGISTRY(SSAM_SSH_TC_REG, tid, 0x01, 0x02)
 
 /**
  * enum ssam_event_notifier_flags - Flags for event notifiers.
-- 
cgit v1.2.3


From dd66f56caea6bb1a3703fb3bfc3106444d05a930 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
Date: Thu, 21 Oct 2021 08:55:24 +0200
Subject: dma-buf: fix kerneldoc for renamed members
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Those members where renamed, update the kerneldoc as well.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Acked-by: Sumit Semwal <sumit.semwal@linaro.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20211021141945.84023-1-christian.koenig@amd.com
---
 include/linux/dma-buf.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index 02c2eb874da6..9807aef33685 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -433,8 +433,8 @@ struct dma_buf {
 	/** @poll: for userspace poll support */
 	wait_queue_head_t poll;
 
-	/** @cb_excl: for userspace poll support */
-	/** @cb_shared: for userspace poll support */
+	/** @cb_in: for userspace poll support */
+	/** @cb_out: for userspace poll support */
 	struct dma_buf_poll_cb_t {
 		struct dma_fence_cb cb;
 		wait_queue_head_t *poll;
-- 
cgit v1.2.3


From 48d09e97876bed4bcc503d528bdba8c907e43cb3 Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@kernel.org>
Date: Thu, 21 Oct 2021 08:58:34 -0700
Subject: firmware_loader: formalize built-in firmware API

Formalize the built-in firmware with a proper API. This can later
be used by other callers where all they need is built-in firmware.

We export the firmware_request_builtin() call for now only
under the TEST_FIRMWARE symbol namespace as there are no
direct modular users for it. If they pop up they are free
to export it generally. Built-in code always gets access to
the callers and we'll demonstrate a hidden user which has been
lurking in the kernel for a while and the reason why using a
proper API was better long term.

Reviewed-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Link: https://lore.kernel.org/r/20211021155843.1969401-2-mcgrof@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/firmware.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/firmware.h b/include/linux/firmware.h
index 25109192cebe..d743a8d1c2fe 100644
--- a/include/linux/firmware.h
+++ b/include/linux/firmware.h
@@ -20,12 +20,19 @@ struct firmware {
 struct module;
 struct device;
 
+/*
+ * Built-in firmware functionality is only available if FW_LOADER=y, but not
+ * FW_LOADER=m
+ */
+#ifdef CONFIG_FW_LOADER
 struct builtin_fw {
 	char *name;
 	void *data;
 	unsigned long size;
 };
 
+bool firmware_request_builtin(struct firmware *fw, const char *name);
+
 /* We have to play tricks here much like stringify() to get the
    __COUNTER__ macro to be expanded as we want it */
 #define __fw_concat1(x, y) x##y
@@ -38,6 +45,14 @@ struct builtin_fw {
 	static const struct builtin_fw __fw_concat(__builtin_fw,__COUNTER__) \
 	__used __section(".builtin_fw") = { name, blob, size }
 
+#else
+static inline bool firmware_request_builtin(struct firmware *fw,
+					    const char *name)
+{
+	return false;
+}
+#endif
+
 #if defined(CONFIG_FW_LOADER) || (defined(CONFIG_FW_LOADER_MODULE) && defined(MODULE))
 int request_firmware(const struct firmware **fw, const char *name,
 		     struct device *device);
-- 
cgit v1.2.3


From e520ecf4546fdaa7169ba75a35d24e2c53403a6e Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@kernel.org>
Date: Thu, 21 Oct 2021 08:58:35 -0700
Subject: firmware_loader: remove old DECLARE_BUILTIN_FIRMWARE()

This was never used upstream. Time to get rid of it. We
don't carry around unused baggage.

Reviewed-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Link: https://lore.kernel.org/r/20211021155843.1969401-3-mcgrof@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/firmware.h | 13 -------------
 1 file changed, 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/firmware.h b/include/linux/firmware.h
index d743a8d1c2fe..34e8d5844fa0 100644
--- a/include/linux/firmware.h
+++ b/include/linux/firmware.h
@@ -32,19 +32,6 @@ struct builtin_fw {
 };
 
 bool firmware_request_builtin(struct firmware *fw, const char *name);
-
-/* We have to play tricks here much like stringify() to get the
-   __COUNTER__ macro to be expanded as we want it */
-#define __fw_concat1(x, y) x##y
-#define __fw_concat(x, y) __fw_concat1(x, y)
-
-#define DECLARE_BUILTIN_FIRMWARE(name, blob)				     \
-	DECLARE_BUILTIN_FIRMWARE_SIZE(name, &(blob), sizeof(blob))
-
-#define DECLARE_BUILTIN_FIRMWARE_SIZE(name, blob, size)			     \
-	static const struct builtin_fw __fw_concat(__builtin_fw,__COUNTER__) \
-	__used __section(".builtin_fw") = { name, blob, size }
-
 #else
 static inline bool firmware_request_builtin(struct firmware *fw,
 					    const char *name)
-- 
cgit v1.2.3


From e2e2c0f20f321b0ec36e8bde467259c0adf1fecb Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@kernel.org>
Date: Thu, 21 Oct 2021 08:58:37 -0700
Subject: firmware_loader: move struct builtin_fw to the only place used

Now that x86 doesn't abuse picking at internals to the firmware
loader move out the built-in firmware struct to its only user.

Reviewed-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Link: https://lore.kernel.org/r/20211021155843.1969401-5-mcgrof@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/firmware.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/firmware.h b/include/linux/firmware.h
index 34e8d5844fa0..3b057dfc8284 100644
--- a/include/linux/firmware.h
+++ b/include/linux/firmware.h
@@ -25,12 +25,6 @@ struct device;
  * FW_LOADER=m
  */
 #ifdef CONFIG_FW_LOADER
-struct builtin_fw {
-	char *name;
-	void *data;
-	unsigned long size;
-};
-
 bool firmware_request_builtin(struct firmware *fw, const char *name);
 #else
 static inline bool firmware_request_builtin(struct firmware *fw,
-- 
cgit v1.2.3


From 9208d414975895f69e9aca49153060ddd31b18d0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 21 Oct 2021 08:06:01 +0200
Subject: block: add a ->get_unique_id method

Add a method to query unique IDs from block devices.  It will be used to
remove code that deeply pokes into SCSI internals in the NFS server.
The implementation in the sd driver itself is also much nicer as it can
use the cached VPD page instead of always sending a command as the
current NFS code does.

For now the interface is kept very minimal but could be easily
extended when other users like a block-layer sysfs interface for
uniquue IDs shows up.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Link: https://lore.kernel.org/r/20211021060607.264371-2-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f72ccb2829db..0d5826066e16 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1158,6 +1158,14 @@ static inline void blk_crypto_unregister(struct request_queue *q) { }
 
 #endif /* CONFIG_BLK_INLINE_ENCRYPTION */
 
+enum blk_unique_id {
+	/* these match the Designator Types specified in SPC */
+	BLK_UID_T10	= 1,
+	BLK_UID_EUI64	= 2,
+	BLK_UID_NAA	= 3,
+};
+
+#define NFL4_UFLG_MASK			0x0000003F
 
 struct block_device_operations {
 	void (*submit_bio)(struct bio *bio);
@@ -1176,6 +1184,9 @@ struct block_device_operations {
 	int (*report_zones)(struct gendisk *, sector_t sector,
 			unsigned int nr_zones, report_zones_cb cb, void *data);
 	char *(*devnode)(struct gendisk *disk, umode_t *mode);
+	/* returns the length of the identifier or a negative errno: */
+	int (*get_unique_id)(struct gendisk *disk, u8 id[16],
+			enum blk_unique_id id_type);
 	struct module *owner;
 	const struct pr_ops *pr_ops;
 
-- 
cgit v1.2.3


From 4abafdc4360d993104c2b2f85943938a0c6ad025 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 21 Oct 2021 08:06:06 +0200
Subject: block: remove the initialize_rq_fn blk_mq_ops method

Entirely unused now.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Link: https://lore.kernel.org/r/20211021060607.264371-7-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index b4039fdf1b04..ebc45cf0450b 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -566,11 +566,6 @@ struct blk_mq_ops {
 	void (*exit_request)(struct blk_mq_tag_set *set, struct request *,
 			     unsigned int);
 
-	/**
-	 * @initialize_rq_fn: Called from inside blk_get_request().
-	 */
-	void (*initialize_rq_fn)(struct request *rq);
-
 	/**
 	 * @cleanup_rq: Called before freeing one request which isn't completed
 	 * yet, and usually for freeing the driver private data.
-- 
cgit v1.2.3


From 4845012eb5b4e56cadb5f484cb55dd4fd9d1df80 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 21 Oct 2021 08:06:07 +0200
Subject: block: remove QUEUE_FLAG_SCSI_PASSTHROUGH

Export scsi_device_from_queue for use with pktcdvd and use that instead
of the otherwise unused QUEUE_FLAG_SCSI_PASSTHROUGH queue flag.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Link: https://lore.kernel.org/r/20211021060607.264371-8-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0d5826066e16..1ad30f85d30e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -357,7 +357,6 @@ struct request_queue {
 #define QUEUE_FLAG_STATS	20	/* track IO start and completion times */
 #define QUEUE_FLAG_POLL_STATS	21	/* collecting stats for hybrid polling */
 #define QUEUE_FLAG_REGISTERED	22	/* queue has been registered to a disk */
-#define QUEUE_FLAG_SCSI_PASSTHROUGH 23	/* queue supports SCSI commands */
 #define QUEUE_FLAG_QUIESCED	24	/* queue has been quiesced */
 #define QUEUE_FLAG_PCI_P2PDMA	25	/* device supports PCI p2p requests */
 #define QUEUE_FLAG_ZONE_RESETALL 26	/* supports Zone Reset All */
@@ -391,8 +390,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
 #define blk_queue_secure_erase(q) \
 	(test_bit(QUEUE_FLAG_SECERASE, &(q)->queue_flags))
 #define blk_queue_dax(q)	test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags)
-#define blk_queue_scsi_passthrough(q)	\
-	test_bit(QUEUE_FLAG_SCSI_PASSTHROUGH, &(q)->queue_flags)
 #define blk_queue_pci_p2pdma(q)	\
 	test_bit(QUEUE_FLAG_PCI_P2PDMA, &(q)->queue_flags)
 #ifdef CONFIG_BLK_RQ_ALLOC_TIME
-- 
cgit v1.2.3


From 70164eb6ccb76ab679b016b4b60123bf4ec6c162 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 19 Oct 2021 08:25:25 +0200
Subject: block: remove __sync_blockdev

Instead offer a new sync_blockdev_nowait helper for the !wait case.
This new helper is exported as it will grow modular callers in a bit.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20211019062530.2174626-3-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f72ccb2829db..4b5a6bbdacd0 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1266,6 +1266,7 @@ int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart,
 #ifdef CONFIG_BLOCK
 void invalidate_bdev(struct block_device *bdev);
 int sync_blockdev(struct block_device *bdev);
+int sync_blockdev_nowait(struct block_device *bdev);
 #else
 static inline void invalidate_bdev(struct block_device *bdev)
 {
@@ -1274,6 +1275,10 @@ static inline int sync_blockdev(struct block_device *bdev)
 {
 	return 0;
 }
+static inline int sync_blockdev_nowait(struct block_device *bdev)
+{
+	return 0;
+}
 #endif
 int fsync_bdev(struct block_device *bdev);
 
-- 
cgit v1.2.3


From 1e03a36bdff4709c1bbf0f57f60ae3f776d51adf Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 19 Oct 2021 08:25:30 +0200
Subject: block: simplify the block device syncing code

Get rid of the indirections and just provide a sync_bdevs
helper for the generic sync code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20211019062530.2174626-8-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4b5a6bbdacd0..09fdf7018b7f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1267,6 +1267,7 @@ int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart,
 void invalidate_bdev(struct block_device *bdev);
 int sync_blockdev(struct block_device *bdev);
 int sync_blockdev_nowait(struct block_device *bdev);
+void sync_bdevs(bool wait);
 #else
 static inline void invalidate_bdev(struct block_device *bdev)
 {
@@ -1279,6 +1280,9 @@ static inline int sync_blockdev_nowait(struct block_device *bdev)
 {
 	return 0;
 }
+static inline void sync_bdevs(bool wait)
+{
+}
 #endif
 int fsync_bdev(struct block_device *bdev);
 
-- 
cgit v1.2.3


From 599593a82fc57f5e9453c8ef7420df3206934a0c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 22 Oct 2021 19:35:45 -0600
Subject: sched: make task_struct->plug always defined
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If CONFIG_BLOCK isn't set, then it's an empty struct anyway. Just make
it generally available, so we don't break the compile:

kernel/sched/core.c: In function ‘sched_submit_work’:
kernel/sched/core.c:6346:35: error: ‘struct task_struct’ has no member named ‘plug’
 6346 |                 blk_flush_plug(tsk->plug, true);
      |                                   ^~
kernel/sched/core.c: In function ‘io_schedule_prepare’:
kernel/sched/core.c:8357:20: error: ‘struct task_struct’ has no member named ‘plug’
 8357 |         if (current->plug)
      |                    ^~
kernel/sched/core.c:8358:39: error: ‘struct task_struct’ has no member named ‘plug’
 8358 |                 blk_flush_plug(current->plug, true);
      |                                       ^~

Reported-by: Nathan Chancellor <nathan@kernel.org>
Fixes: 008f75a20e70 ("block: cleanup the flush plug helpers")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/sched.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c1a927ddec64..e0454e60fe8f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1160,10 +1160,8 @@ struct task_struct {
 	/* Stacked block device info: */
 	struct bio_list			*bio_list;
 
-#ifdef CONFIG_BLOCK
 	/* Stack plugging: */
 	struct blk_plug			*plug;
-#endif
 
 	/* VM state: */
 	struct reclaim_state		*reclaim_state;
-- 
cgit v1.2.3


From 0ebecb2644c834d8a69f07c5d44a130835950171 Mon Sep 17 00:00:00 2001
From: Sean Anderson <sean.anderson@seco.com>
Date: Fri, 22 Oct 2021 11:59:12 -0400
Subject: net: mdio: Add helper functions for accessing MDIO devices

This adds some helpers for accessing non-phy MDIO devices. They are
analogous to phy_(read|write|modify), except that they take an mdio_device
and not a phy_device.

Signed-off-by: Sean Anderson <sean.anderson@seco.com>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mdio.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mdio.h b/include/linux/mdio.h
index f622888a4ba8..9f3587a61e14 100644
--- a/include/linux/mdio.h
+++ b/include/linux/mdio.h
@@ -352,6 +352,30 @@ int mdiobus_modify(struct mii_bus *bus, int addr, u32 regnum, u16 mask,
 int mdiobus_modify_changed(struct mii_bus *bus, int addr, u32 regnum,
 			   u16 mask, u16 set);
 
+static inline int mdiodev_read(struct mdio_device *mdiodev, u32 regnum)
+{
+	return mdiobus_read(mdiodev->bus, mdiodev->addr, regnum);
+}
+
+static inline int mdiodev_write(struct mdio_device *mdiodev, u32 regnum,
+				u16 val)
+{
+	return mdiobus_write(mdiodev->bus, mdiodev->addr, regnum, val);
+}
+
+static inline int mdiodev_modify(struct mdio_device *mdiodev, u32 regnum,
+				 u16 mask, u16 set)
+{
+	return mdiobus_modify(mdiodev->bus, mdiodev->addr, regnum, mask, set);
+}
+
+static inline int mdiodev_modify_changed(struct mdio_device *mdiodev,
+					 u32 regnum, u16 mask, u16 set)
+{
+	return mdiobus_modify_changed(mdiodev->bus, mdiodev->addr, regnum,
+				      mask, set);
+}
+
 static inline u32 mdiobus_c45_addr(int devad, u16 regnum)
 {
 	return MII_ADDR_C45 | devad << MII_DEVADDR_C45_SHIFT | regnum;
-- 
cgit v1.2.3


From 218f23e8a96fea7185dac68ceb3722d0831a8bcb Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Fri, 22 Oct 2021 09:17:01 -0700
Subject: net: phy: bcm7xxx: Add EPHY entry for 7712

7712 is a 16nm process SoC with a 10/100 integrated Ethernet PHY,
utilize the recently defined 16nm EPHY macro to configure that PHY.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/brcmphy.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index 27d9b6683f0e..747fad264033 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -50,6 +50,7 @@
 #define PHY_ID_BCM7439			0x600d8480
 #define PHY_ID_BCM7439_2		0xae025080
 #define PHY_ID_BCM7445			0x600d8510
+#define PHY_ID_BCM7712			0x35905330
 
 #define PHY_ID_BCM_CYGNUS		0xae025200
 #define PHY_ID_BCM_OMEGA		0xae025100
-- 
cgit v1.2.3


From 97308f8b0d867e9ef59528cd97f0db55ffdf5651 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Fri, 23 Jul 2021 01:59:41 +0200
Subject: iomap: Support partial direct I/O on user copy failures

In iomap_dio_rw, when iomap_apply returns an -EFAULT error and the
IOMAP_DIO_PARTIAL flag is set, complete the request synchronously and
return a partial result.  This allows the caller to deal with the page
fault and retry the remainder of the request.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
---
 include/linux/iomap.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 24f8489583ca..2a213b0d1e1f 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -330,6 +330,13 @@ struct iomap_dio_ops {
   */
 #define IOMAP_DIO_OVERWRITE_ONLY	(1 << 1)
 
+/*
+ * When a page fault occurs, return a partial synchronous result and allow
+ * the caller to retry the rest of the operation after dealing with the page
+ * fault.
+ */
+#define IOMAP_DIO_PARTIAL		(1 << 2)
+
 ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
 		unsigned int dio_flags);
-- 
cgit v1.2.3


From 4fdccaa0d184c202f98d73b24e3ec8eeee88ab8d Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Sat, 24 Jul 2021 12:26:41 +0200
Subject: iomap: Add done_before argument to iomap_dio_rw

Add a done_before argument to iomap_dio_rw that indicates how much of
the request has already been transferred.  When the request succeeds, we
report that done_before additional bytes were tranferred.  This is
useful for finishing a request asynchronously when part of the request
has already been completed synchronously.

We'll use that to allow iomap_dio_rw to be used with page faults
disabled: when a page fault occurs while submitting a request, we
synchronously complete the part of the request that has already been
submitted.  The caller can then take care of the page fault and call
iomap_dio_rw again for the rest of the request, passing in the number of
bytes already tranferred.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
---
 include/linux/iomap.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 2a213b0d1e1f..829f2325ecba 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -339,10 +339,10 @@ struct iomap_dio_ops {
 
 ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
-		unsigned int dio_flags);
+		unsigned int dio_flags, size_t done_before);
 struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
-		unsigned int dio_flags);
+		unsigned int dio_flags, size_t done_before);
 ssize_t iomap_dio_complete(struct iomap_dio *dio);
 int iomap_dio_iopoll(struct kiocb *kiocb, bool spin);
 
-- 
cgit v1.2.3


From 55b8fe703bc51200d4698596c90813453b35ae63 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Tue, 17 Aug 2021 22:52:08 +0200
Subject: gup: Introduce FOLL_NOFAULT flag to disable page faults

Introduce a new FOLL_NOFAULT flag that causes get_user_pages to return
-EFAULT when it would otherwise trigger a page fault.  This is roughly
similar to FOLL_FAST_ONLY but available on all architectures, and less
fragile.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 include/linux/mm.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 73a52aba448f..2f0e6b9f8f3b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2851,7 +2851,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 #define FOLL_FORCE	0x10	/* get_user_pages read/write w/o permission */
 #define FOLL_NOWAIT	0x20	/* if a disk transfer is needed, start the IO
 				 * and return without waiting upon it */
-#define FOLL_POPULATE	0x40	/* fault in page */
+#define FOLL_POPULATE	0x40	/* fault in pages (with FOLL_MLOCK) */
+#define FOLL_NOFAULT	0x80	/* do not fault in pages */
 #define FOLL_HWPOISON	0x100	/* check page is hwpoisoned */
 #define FOLL_NUMA	0x200	/* force NUMA hinting page fault */
 #define FOLL_MIGRATION	0x400	/* wait for page to replace migration entry */
-- 
cgit v1.2.3


From 3337ab08d08b1a375f88471d9c8b1cac968cb054 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Mon, 12 Jul 2021 12:06:14 +0200
Subject: iov_iter: Introduce nofault flag to disable page faults

Introduce a new nofault flag to indicate to iov_iter_get_pages not to
fault in user pages.

This is implemented by passing the FOLL_NOFAULT flag to get_user_pages,
which causes get_user_pages to fail when it would otherwise fault in a
page. We'll use the ->nofault flag to prevent iomap_dio_rw from faulting
in pages when page faults are not allowed.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 include/linux/uio.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 25d1c24fd829..6350354f97e9 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -35,6 +35,7 @@ struct iov_iter_state {
 
 struct iov_iter {
 	u8 iter_type;
+	bool nofault;
 	bool data_source;
 	size_t iov_offset;
 	size_t count;
-- 
cgit v1.2.3


From 63dfe0709643528290c8a6825f278eda0e3f3c2e Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Date: Sat, 18 Sep 2021 18:56:32 +0900
Subject: can: bittiming: allow TDC{V,O} to be zero and add
 can_tdc_const::tdc{v,o,f}_min

ISO 11898-1 specifies in section 11.3.3 "Transmitter delay
compensation" that "the configuration range for [the] SSP position
shall be at least 0 to 63 minimum time quanta."

Because SSP = TDCV + TDCO, it means that we should allow both TDCV and
TDCO to hold zero value in order to honor SSP's minimum possible
value.

However, current implementation assigned special meaning to TDCV and
TDCO's zero values:
  * TDCV = 0 -> TDCV is automatically measured by the transceiver.
  * TDCO = 0 -> TDC is off.

In order to allow for those values to really be zero and to maintain
current features, we introduce two new flags:
  * CAN_CTRLMODE_TDC_AUTO indicates that the controller support
    automatic measurement of TDCV.
  * CAN_CTRLMODE_TDC_MANUAL indicates that the controller support
    manual configuration of TDCV. N.B.: current implementation failed
    to provide an option for the driver to indicate that only manual
    mode was supported.

TDC is disabled if both CAN_CTRLMODE_TDC_AUTO and
CAN_CTRLMODE_TDC_MANUAL flags are off, c.f. the helper function
can_tdc_is_enabled() which is also introduced in this patch.

Also, this patch adds three fields: tdcv_min, tdco_min and tdcf_min to
struct can_tdc_const. While we are not convinced that those three
fields could be anything else than zero, we can imagine that some
controllers might specify a lower bound on these. Thus, those minimums
are really added "just in case".

Comments of struct can_tdc and can_tdc_const are updated accordingly.

Finally, the changes are applied to the etas_es58x driver.

Link: https://lore.kernel.org/all/20210918095637.20108-2-mailhol.vincent@wanadoo.fr
Signed-off-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/linux/can/bittiming.h | 64 +++++++++++++++++++++++++++++++------------
 include/linux/can/dev.h       |  4 +++
 2 files changed, 51 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h
index 9de6e9053e34..9e20260611cc 100644
--- a/include/linux/can/bittiming.h
+++ b/include/linux/can/bittiming.h
@@ -19,6 +19,9 @@
 /* Megahertz */
 #define CAN_MHZ 1000000UL
 
+#define CAN_CTRLMODE_TDC_MASK					\
+	(CAN_CTRLMODE_TDC_AUTO | CAN_CTRLMODE_TDC_MANUAL)
+
 /*
  * struct can_tdc - CAN FD Transmission Delay Compensation parameters
  *
@@ -33,29 +36,43 @@
  *
  * This structure contains the parameters to calculate that SSP.
  *
- * @tdcv: Transmitter Delay Compensation Value. Distance, in time
- *	quanta, from when the bit is sent on the TX pin to when it is
- *	received on the RX pin of the transmitter. Possible options:
+ * -+----------- one bit ----------+-- TX pin
+ *  |<--- Sample Point --->|
+ *
+ *                         --+----------- one bit ----------+-- RX pin
+ *  |<-------- TDCV -------->|
+ *                           |<------- TDCO ------->|
+ *  |<----------- Secondary Sample Point ---------->|
+ *
+ * @tdcv: Transmitter Delay Compensation Value. The time needed for
+ *	the signal to propagate, i.e. the distance, in time quanta,
+ *	from the start of the bit on the TX pin to when it is received
+ *	on the RX pin. @tdcv depends on the controller modes:
+ *
+ *	  CAN_CTRLMODE_TDC_AUTO is set: The transceiver dynamically
+ *	  measures @tdcv for each transmitted CAN FD frame and the
+ *	  value provided here should be ignored.
  *
- *	  0: automatic mode. The controller dynamically measures @tdcv
- *	  for each transmitted CAN FD frame.
+ *	  CAN_CTRLMODE_TDC_MANUAL is set: use the fixed provided @tdcv
+ *	  value.
  *
- *	  Other values: manual mode. Use the fixed provided value.
+ *	N.B. CAN_CTRLMODE_TDC_AUTO and CAN_CTRLMODE_TDC_MANUAL are
+ *	mutually exclusive. Only one can be set at a time. If both
+ *	CAN_TDC_CTRLMODE_AUTO and CAN_TDC_CTRLMODE_MANUAL are unset,
+ *	TDC is disabled and all the values of this structure should be
+ *	ignored.
  *
  * @tdco: Transmitter Delay Compensation Offset. Offset value, in time
  *	quanta, defining the distance between the start of the bit
  *	reception on the RX pin of the transceiver and the SSP
  *	position such that SSP = @tdcv + @tdco.
  *
- *	If @tdco is zero, then TDC is disabled and both @tdcv and
- *	@tdcf should be ignored.
- *
  * @tdcf: Transmitter Delay Compensation Filter window. Defines the
- *	minimum value for the SSP position in time quanta. If SSP is
- *	less than @tdcf, then no delay compensations occur and the
- *	normal sampling point is used instead. The feature is enabled
- *	if and only if @tdcv is set to zero (automatic mode) and @tdcf
- *	is configured to a value greater than @tdco.
+ *	minimum value for the SSP position in time quanta. If the SSP
+ *	position is less than @tdcf, then no delay compensations occur
+ *	and the normal sampling point is used instead. The feature is
+ *	enabled if and only if @tdcv is set to zero (automatic mode)
+ *	and @tdcf is configured to a value greater than @tdco.
  */
 struct can_tdc {
 	u32 tdcv;
@@ -67,19 +84,32 @@ struct can_tdc {
  * struct can_tdc_const - CAN hardware-dependent constant for
  *	Transmission Delay Compensation
  *
- * @tdcv_max: Transmitter Delay Compensation Value maximum value.
- *	Should be set to zero if the controller does not support
- *	manual mode for tdcv.
+ * @tdcv_min: Transmitter Delay Compensation Value minimum value. If
+ *	the controller does not support manual mode for tdcv
+ *	(c.f. flag CAN_CTRLMODE_TDC_MANUAL) then this value is
+ *	ignored.
+ * @tdcv_max: Transmitter Delay Compensation Value maximum value. If
+ *	the controller does not support manual mode for tdcv
+ *	(c.f. flag CAN_CTRLMODE_TDC_MANUAL) then this value is
+ *	ignored.
+ *
+ * @tdco_min: Transmitter Delay Compensation Offset minimum value.
  * @tdco_max: Transmitter Delay Compensation Offset maximum value.
  *	Should not be zero. If the controller does not support TDC,
  *	then the pointer to this structure should be NULL.
+ *
+ * @tdcf_min: Transmitter Delay Compensation Filter window minimum
+ *	value. If @tdcf_max is zero, this value is ignored.
  * @tdcf_max: Transmitter Delay Compensation Filter window maximum
  *	value. Should be set to zero if the controller does not
  *	support this feature.
  */
 struct can_tdc_const {
+	u32 tdcv_min;
 	u32 tdcv_max;
+	u32 tdco_min;
 	u32 tdco_max;
+	u32 tdcf_min;
 	u32 tdcf_max;
 };
 
diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index 2413253e54c7..6dacbbb41e68 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -96,6 +96,10 @@ struct can_priv {
 #endif
 };
 
+static inline bool can_tdc_is_enabled(const struct can_priv *priv)
+{
+	return !!(priv->ctrlmode & CAN_CTRLMODE_TDC_MASK);
+}
 
 /* helper to define static CAN controller features at device creation time */
 static inline void can_set_static_ctrlmode(struct net_device *dev,
-- 
cgit v1.2.3


From 39f66c9e229797a58a12ea78388cbbad1f81aec9 Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Date: Sat, 18 Sep 2021 18:56:33 +0900
Subject: can: bittiming: change unit of TDC parameters to clock periods
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In the current implementation, all Transmission Delay Compensation
(TDC) parameters are expressed in time quantum. However, ISO 11898-1
actually specifies that these should be expressed in *minimum* time
quantum.

Furthermore, the minimum time quantum is specified to be "one node
clock period long" (c.f. paragraph 11.3.1.1 "Bit time"). For sake of
simplicity, we prefer to use the "clock period" term instead of
"minimum time quantum" because we believe that it is more broadly
understood.

This patch fixes that discrepancy by updating the documentation and
the formula for TDCO calculation.

N.B. In can_calc_tdco(), the sample point (in time quantum) was
calculated using a division, thus introducing a risk of rounding and
truncation errors. On top of changing the unit to clock period, we
also modified the formula to use only additions.

Link: https://lore.kernel.org/all/20210918095637.20108-3-mailhol.vincent@wanadoo.fr
Suggested-by: Stefan Mätje <Stefan.Maetje@esd.eu>
Signed-off-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/linux/can/bittiming.h | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h
index 9e20260611cc..aebbe65dab7e 100644
--- a/include/linux/can/bittiming.h
+++ b/include/linux/can/bittiming.h
@@ -31,8 +31,8 @@
  *
  * To solve this issue, ISO 11898-1 introduces in section 11.3.3
  * "Transmitter delay compensation" a SSP (Secondary Sample Point)
- * equal to the distance, in time quanta, from the start of the bit
- * time on the TX pin to the actual measurement on the RX pin.
+ * equal to the distance from the start of the bit time on the TX pin
+ * to the actual measurement on the RX pin.
  *
  * This structure contains the parameters to calculate that SSP.
  *
@@ -44,8 +44,13 @@
  *                           |<------- TDCO ------->|
  *  |<----------- Secondary Sample Point ---------->|
  *
+ * To increase precision, contrary to the other bittiming parameters
+ * which are measured in time quanta, the TDC parameters are measured
+ * in clock periods (also referred as "minimum time quantum" in ISO
+ * 11898-1).
+ *
  * @tdcv: Transmitter Delay Compensation Value. The time needed for
- *	the signal to propagate, i.e. the distance, in time quanta,
+ *	the signal to propagate, i.e. the distance, in clock periods,
  *	from the start of the bit on the TX pin to when it is received
  *	on the RX pin. @tdcv depends on the controller modes:
  *
@@ -62,17 +67,18 @@
  *	TDC is disabled and all the values of this structure should be
  *	ignored.
  *
- * @tdco: Transmitter Delay Compensation Offset. Offset value, in time
- *	quanta, defining the distance between the start of the bit
- *	reception on the RX pin of the transceiver and the SSP
+ * @tdco: Transmitter Delay Compensation Offset. Offset value, in
+ *	clock periods, defining the distance between the start of the
+ *	bit reception on the RX pin of the transceiver and the SSP
  *	position such that SSP = @tdcv + @tdco.
  *
  * @tdcf: Transmitter Delay Compensation Filter window. Defines the
- *	minimum value for the SSP position in time quanta. If the SSP
- *	position is less than @tdcf, then no delay compensations occur
- *	and the normal sampling point is used instead. The feature is
- *	enabled if and only if @tdcv is set to zero (automatic mode)
- *	and @tdcf is configured to a value greater than @tdco.
+ *	minimum value for the SSP position in clock periods. If the
+ *	SSP position is less than @tdcf, then no delay compensations
+ *	occur and the normal sampling point is used instead. The
+ *	feature is enabled if and only if @tdcv is set to zero
+ *	(automatic mode) and @tdcf is configured to a value greater
+ *	than @tdco.
  */
 struct can_tdc {
 	u32 tdcv;
-- 
cgit v1.2.3


From da45a1e4d7b9d6b5a8231acb812df719fe3228b4 Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Date: Sat, 18 Sep 2021 18:56:34 +0900
Subject: can: bittiming: change can_calc_tdco()'s prototype to not directly
 modify priv

The function can_calc_tdco() directly retrieves can_priv from the
net_device and directly modifies it.

This is annoying for the upcoming patch. In
drivers/net/can/dev/netlink.c:can_changelink(), the data bittiming are
written to a temporary structure and memcpyed to can_priv only after
everything succeeded. In the next patch, where we will introduce the
netlink interface for TDC parameters, we will add a new TDC block
which can potentially fail. For this reason, the data bittiming
temporary structure has to be copied after that to-be-introduced TDC
block. However, TDC also needs to access data bittiming information.

We change the prototype so that the data bittiming structure is passed
to can_calc_tdco() as an argument instead of retrieving it from
priv. This way can_calc_tdco() can access the data bittiming before it
gets memcpyed to priv.

Link: https://lore.kernel.org/all/20210918095637.20108-4-mailhol.vincent@wanadoo.fr
Signed-off-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/linux/can/bittiming.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h
index aebbe65dab7e..20b50baf3a02 100644
--- a/include/linux/can/bittiming.h
+++ b/include/linux/can/bittiming.h
@@ -123,7 +123,9 @@ struct can_tdc_const {
 int can_calc_bittiming(struct net_device *dev, struct can_bittiming *bt,
 		       const struct can_bittiming_const *btc);
 
-void can_calc_tdco(struct net_device *dev);
+void can_calc_tdco(struct can_tdc *tdc, const struct can_tdc_const *tdc_const,
+		   const struct can_bittiming *dbt,
+		   u32 *ctrlmode, u32 ctrlmode_supported);
 #else /* !CONFIG_CAN_CALC_BITTIMING */
 static inline int
 can_calc_bittiming(struct net_device *dev, struct can_bittiming *bt,
@@ -133,7 +135,10 @@ can_calc_bittiming(struct net_device *dev, struct can_bittiming *bt,
 	return -EINVAL;
 }
 
-static inline void can_calc_tdco(struct net_device *dev)
+static inline void
+can_calc_tdco(struct can_tdc *tdc, const struct can_tdc_const *tdc_const,
+	      const struct can_bittiming *dbt,
+	      u32 *ctrlmode, u32 ctrlmode_supported)
 {
 }
 #endif /* CONFIG_CAN_CALC_BITTIMING */
-- 
cgit v1.2.3


From e8060f08cd69d1d692cfb9f0a2808477a501f35a Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Date: Sat, 18 Sep 2021 18:56:36 +0900
Subject: can: netlink: add can_priv::do_get_auto_tdcv() to retrieve tdcv from
 device
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some CAN device can measure the TDCV (Transmission Delay Compensation
Value) automatically for each transmitted CAN frames.

A callback function do_get_auto_tdcv() is added to retrieve that
value. This function is used only if CAN_CTRLMODE_TDC_AUTO is enabled
(if CAN_CTRLMODE_TDC_MANUAL is selected, the TDCV value is provided by
the user).

If the device does not support reporting of TDCV, do_get_auto_tdcv()
should be set to NULL and TDCV will not be reported by the netlink
interface.

On success, do_get_auto_tdcv() shall return 0. If the value can not be
measured by the device, for example because network is down or because
no frames were transmitted yet, can_priv::do_get_auto_tdcv() shall
return a negative error code (e.g. -EINVAL) to signify that the value
is not yet available. In such cases, TDCV is not reported by the
netlink interface.

Link: https://lore.kernel.org/all/20210918095637.20108-6-mailhol.vincent@wanadoo.fr
CC: Stefan Mätje <stefan.maetje@esd.eu>
Signed-off-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/linux/can/dev.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index 6dacbbb41e68..b4aa0f048cab 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -82,6 +82,7 @@ struct can_priv {
 			    enum can_state *state);
 	int (*do_get_berr_counter)(const struct net_device *dev,
 				   struct can_berr_counter *bec);
+	int (*do_get_auto_tdcv)(const struct net_device *dev, u32 *tdcv);
 
 	unsigned int echo_skb_max;
 	struct sk_buff **echo_skb;
-- 
cgit v1.2.3


From fa759a9395ea81c17db613dde43c46f0607df7e7 Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Date: Sat, 18 Sep 2021 18:56:37 +0900
Subject: can: dev: add can_tdc_get_relative_tdco() helper function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

struct can_tdc::tdco represents the absolute offset from TDCV. Some
controllers use instead an offset relative to the Sample Point (SP)
such that:
| SSP = TDCV + absolute TDCO
|     = TDCV + SP + relative TDCO

Consequently:
| relative TDCO = absolute TDCO - SP

The function can_tdc_get_relative_tdco() allow to retrieve this
relative TDCO value.

Link: https://lore.kernel.org/all/20210918095637.20108-7-mailhol.vincent@wanadoo.fr
CC: Stefan Mätje <Stefan.Maetje@esd.eu>
Signed-off-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/linux/can/dev.h | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index b4aa0f048cab..45f19d9db5ca 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -102,6 +102,35 @@ static inline bool can_tdc_is_enabled(const struct can_priv *priv)
 	return !!(priv->ctrlmode & CAN_CTRLMODE_TDC_MASK);
 }
 
+/*
+ * can_get_relative_tdco() - TDCO relative to the sample point
+ *
+ * struct can_tdc::tdco represents the absolute offset from TDCV. Some
+ * controllers use instead an offset relative to the Sample Point (SP)
+ * such that:
+ *
+ * SSP = TDCV + absolute TDCO
+ *     = TDCV + SP + relative TDCO
+ *
+ * -+----------- one bit ----------+-- TX pin
+ *  |<--- Sample Point --->|
+ *
+ *                         --+----------- one bit ----------+-- RX pin
+ *  |<-------- TDCV -------->|
+ *                           |<------------------------>| absolute TDCO
+ *                           |<--- Sample Point --->|
+ *                           |                      |<->| relative TDCO
+ *  |<------------- Secondary Sample Point ------------>|
+ */
+static inline s32 can_get_relative_tdco(const struct can_priv *priv)
+{
+	const struct can_bittiming *dbt = &priv->data_bittiming;
+	s32 sample_point_in_tc = (CAN_SYNC_SEG + dbt->prop_seg +
+				  dbt->phase_seg1) * dbt->brp;
+
+	return (s32)priv->tdc.tdco - sample_point_in_tc;
+}
+
 /* helper to define static CAN controller features at device creation time */
 static inline void can_set_static_ctrlmode(struct net_device *dev,
 					   u32 static_mode)
-- 
cgit v1.2.3


From b3b180e735409ca0c76642014304b59482e0e653 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 20 Sep 2021 14:20:07 +0200
Subject: dmaengine: remove debugfs #ifdef

The ptdma driver has added debugfs support, but this fails to build
when debugfs is disabled:

drivers/dma/ptdma/ptdma-debugfs.c: In function 'ptdma_debugfs_setup':
drivers/dma/ptdma/ptdma-debugfs.c:93:54: error: 'struct dma_device' has no member named 'dbg_dev_root'
   93 |         debugfs_create_file("info", 0400, pt->dma_dev.dbg_dev_root, pt,
      |                                                      ^
drivers/dma/ptdma/ptdma-debugfs.c:96:55: error: 'struct dma_device' has no member named 'dbg_dev_root'
   96 |         debugfs_create_file("stats", 0400, pt->dma_dev.dbg_dev_root, pt,
      |                                                       ^
drivers/dma/ptdma/ptdma-debugfs.c:102:52: error: 'struct dma_device' has no member named 'dbg_dev_root'
  102 |                 debugfs_create_dir("q", pt->dma_dev.dbg_dev_root);
      |                                                    ^

Remove the #ifdef in the header, as this only saves a few bytes,
but would require ugly #ifdefs in each driver using it.
Simplify the other user while we're at it.

Fixes: e2fb2e2a33fa ("dmaengine: ptdma: Add debugfs entries for PTDMA")
Fixes: 26cf132de6f7 ("dmaengine: Create debug directories for DMA devices")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Link: https://lore.kernel.org/r/20210920122017.205975-1-arnd@kernel.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/dmaengine.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index e5c2c9e71bf1..9000f3ffce8b 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -944,10 +944,8 @@ struct dma_device {
 	void (*device_issue_pending)(struct dma_chan *chan);
 	void (*device_release)(struct dma_device *dev);
 	/* debugfs support */
-#ifdef CONFIG_DEBUG_FS
 	void (*dbg_summary_show)(struct seq_file *s, struct dma_device *dev);
 	struct dentry *dbg_dev_root;
-#endif
 };
 
 static inline int dmaengine_slave_config(struct dma_chan *chan,
-- 
cgit v1.2.3


From 1ba5478270a5c3a02b08052a3c003c282f2db94a Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 22 Oct 2021 17:49:21 +0200
Subject: irqchip: Fix compile-testing without CONFIG_OF

Drivers using the new IRQCHIP_PLATFORM_DRIVER_BEGIN helper
fail to link when compile-testing without CONFIG_OF,
as that means CONFIG_IRQCHIP is disabled as well:

ld.lld: error: undefined symbol: platform_irqchip_probe
>>> referenced by irq-meson-gpio.c
>>>               irqchip/irq-meson-gpio.o:(meson_gpio_intc_driver) in archive drivers/built-in.a
>>> referenced by irq-mchp-eic.c
>>>               irqchip/irq-mchp-eic.o:(mchp_eic_driver) in archive drivers/built-in.a

As the drivers are not actually used in this case, just
making the reference to this symbol conditional helps
avoid the link failure.

Fixes: f8410e626569 ("irqchip: Add IRQCHIP_PLATFORM_DRIVER_BEGIN/END and IRQCHIP_MATCH helper macros")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20211022154927.920491-1-arnd@kernel.org
---
 include/linux/irqchip.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqchip.h b/include/linux/irqchip.h
index 67351aac65ef..29dbe675fc7a 100644
--- a/include/linux/irqchip.h
+++ b/include/linux/irqchip.h
@@ -39,8 +39,9 @@ static const struct of_device_id drv_name##_irqchip_match_table[] = {
 	{},								\
 };									\
 MODULE_DEVICE_TABLE(of, drv_name##_irqchip_match_table);		\
-static struct platform_driver drv_name##_driver = {		\
-	.probe  = platform_irqchip_probe,				\
+static struct platform_driver drv_name##_driver = {			\
+	.probe  = IS_ENABLED(CONFIG_IRQCHIP) ? 				\
+			platform_irqchip_probe : NULL,			\
 	.driver = {							\
 		.name = #drv_name,					\
 		.owner = THIS_MODULE,					\
-- 
cgit v1.2.3


From f2739ca15c414ebad88f4333e3186fd4144c1753 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Fri, 22 Oct 2021 11:46:42 -0500
Subject: x86/of: Kill unused early_init_dt_scan_chosen_arch()

There are no callers for early_init_dt_scan_chosen_arch(), so remove it.

Signed-off-by: Rob Herring <robh@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Frank Rowand <frank.rowand@sony.com>
Link: https://lkml.kernel.org/r/20211022164642.2815706-1-robh@kernel.org
---
 include/linux/of_fdt.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h
index cf6a65b94d40..cf48983d3c86 100644
--- a/include/linux/of_fdt.h
+++ b/include/linux/of_fdt.h
@@ -65,7 +65,6 @@ extern int early_init_dt_scan_memory(unsigned long node, const char *uname,
 extern int early_init_dt_scan_chosen_stdout(void);
 extern void early_init_fdt_scan_reserved_mem(void);
 extern void early_init_fdt_reserve_self(void);
-extern void __init early_init_dt_scan_chosen_arch(unsigned long node);
 extern void early_init_dt_add_memory_arch(u64 base, u64 size);
 extern u64 dt_mem_next_cell(int s, const __be32 **cellp);
 
-- 
cgit v1.2.3


From a1b09501971435ef213251891753afb0d7f3d27a Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Wed, 20 Oct 2021 11:24:06 +0100
Subject: irq: add generic_handle_arch_irq()

Several architectures select GENERIC_IRQ_MULTI_HANDLER and branch to
handle_arch_irq() without performing any entry accounting.

Add a generic wrapper to handle the common irqentry work when invoking
handle_arch_irq(). Where an architecture needs to perform some entry
accounting itself, it will need to invoke handle_arch_irq() itself.

In subsequent patches it will become the responsibilty of the entry code
to set the irq regs when entering an IRQ (rather than deferring this to
an irqchip handler), so generic_handle_arch_irq() is made to set the irq
regs now. This can be redundant in some cases, but is never harmful as
saving/restoring the old regs nests safely.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Guo Ren <guoren@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index c8293c817646..988c225eef2d 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -1261,6 +1261,7 @@ int __init set_handle_irq(void (*handle_irq)(struct pt_regs *));
  * top-level IRQ handler.
  */
 extern void (*handle_arch_irq)(struct pt_regs *) __ro_after_init;
+asmlinkage void generic_handle_arch_irq(struct pt_regs *regs);
 #else
 #ifndef set_handle_irq
 #define set_handle_irq(handle_irq)		\
-- 
cgit v1.2.3


From 63c67f526db86d3102a77437a510c949f6debb08 Mon Sep 17 00:00:00 2001
From: Luo Jie <luoj@codeaurora.org>
Date: Sun, 24 Oct 2021 16:27:34 +0800
Subject: net: phy: add genphy_c45_fast_retrain

Add generic fast retrain auto-negotiation function for C45 PHYs.

Signed-off-by: Luo Jie <luoj@codeaurora.org>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 736e1d1a47c4..04e90423fa88 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1584,6 +1584,7 @@ int genphy_c45_config_aneg(struct phy_device *phydev);
 int genphy_c45_loopback(struct phy_device *phydev, bool enable);
 int genphy_c45_pma_resume(struct phy_device *phydev);
 int genphy_c45_pma_suspend(struct phy_device *phydev);
+int genphy_c45_fast_retrain(struct phy_device *phydev, bool enable);
 
 /* Generic C45 PHY driver */
 extern struct phy_driver genphy_c45_driver;
-- 
cgit v1.2.3


From 1705643faecde95bdeb11bea5ab5baed084e9f91 Mon Sep 17 00:00:00 2001
From: Sean Wang <sean.wang@mediatek.com>
Date: Tue, 19 Oct 2021 05:30:20 +0800
Subject: mmc: add MT7921 SDIO identifiers for MediaTek Bluetooth devices

The MT7961 SDIO identifier for MediaTek Bluetooth devices were being
referred in the MediaTek Bluetooth driver.

Co-developed-by: Mark-yw Chen <mark-yw.chen@mediatek.com>
Signed-off-by: Mark-yw Chen <mark-yw.chen@mediatek.com>
Signed-off-by: Sean Wang <sean.wang@mediatek.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/linux/mmc/sdio_ids.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h
index a85c9f0bd470..53f0efa0bccf 100644
--- a/include/linux/mmc/sdio_ids.h
+++ b/include/linux/mmc/sdio_ids.h
@@ -105,6 +105,7 @@
 #define SDIO_VENDOR_ID_MEDIATEK			0x037a
 #define SDIO_DEVICE_ID_MEDIATEK_MT7663		0x7663
 #define SDIO_DEVICE_ID_MEDIATEK_MT7668		0x7668
+#define SDIO_DEVICE_ID_MEDIATEK_MT7961		0x7961
 
 #define SDIO_VENDOR_ID_MICROCHIP_WILC		0x0296
 #define SDIO_DEVICE_ID_MICROCHIP_WILC1000	0x5347
-- 
cgit v1.2.3


From 6b19b766e8f077f29cdb47da5003469a85bbfb9c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 21 Oct 2021 09:22:35 -0600
Subject: fs: get rid of the res2 iocb->ki_complete argument

The second argument was only used by the USB gadget code, yet everyone
pays the overhead of passing a zero to be passed into aio, where it
ends up being part of the aio res2 value.

Now that everybody is passing in zero, kill off the extra argument.

Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 31029a91f440..0dcb9020a7b3 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -330,7 +330,7 @@ struct kiocb {
 	randomized_struct_fields_start
 
 	loff_t			ki_pos;
-	void (*ki_complete)(struct kiocb *iocb, long ret, long ret2);
+	void (*ki_complete)(struct kiocb *iocb, long ret);
 	void			*private;
 	int			ki_flags;
 	u16			ki_hint;
-- 
cgit v1.2.3


From cb464ba53c0cb497dcb4a3daaf4fad4b75291863 Mon Sep 17 00:00:00 2001
From: Aya Levin <ayal@nvidia.com>
Date: Mon, 11 Oct 2021 13:14:28 +0300
Subject: net/mlx5: Extend health buffer dump

Enhance health buffer to include:
 - assert_var5: expose the 6'th assert variable.
 - time: error's time-stamp in seconds (epoch time).
 - rfr: Recovery Flow Requiered. When set, indicates that the error
        cannot be recovered without flow involving reset.
 - severity: error's severity value, ranging from emergency to debug.
Expose them in the health buffer dump (dmesg and devlink fw reporter).

Health buffer in dmesg:
mlx5_core 0000:08:00.0: print_health_info:425:(pid 912): Health issue observed, firmware internal error, severity(3) ERROR:
mlx5_core 0000:08:00.0: print_health_info:429:(pid 912): assert_var[0] 0x08040700
mlx5_core 0000:08:00.0: print_health_info:429:(pid 912): assert_var[1] 0x00000000
mlx5_core 0000:08:00.0: print_health_info:429:(pid 912): assert_var[2] 0x00000000
mlx5_core 0000:08:00.0: print_health_info:429:(pid 912): assert_var[3] 0x00000000
mlx5_core 0000:08:00.0: print_health_info:429:(pid 912): assert_var[4] 0x00000000
mlx5_core 0000:08:00.0: print_health_info:429:(pid 912): assert_var[5] 0x00000000
mlx5_core 0000:08:00.0: print_health_info:432:(pid 912): assert_exit_ptr 0x00aaf800
mlx5_core 0000:08:00.0: print_health_info:434:(pid 912): assert_callra 0x00aaf70c
mlx5_core 0000:08:00.0: print_health_info:436:(pid 912): fw_ver 16.32.492
mlx5_core 0000:08:00.0: print_health_info:437:(pid 912): time 1634819758
mlx5_core 0000:08:00.0: print_health_info:438:(pid 912): hw_id 0x0000020d
mlx5_core 0000:08:00.0: print_health_info:439:(pid 912): rfr 0
mlx5_core 0000:08:00.0: print_health_info:440:(pid 912): severity 3 (ERROR)
mlx5_core 0000:08:00.0: print_health_info:441:(pid 912): irisc_index 9
mlx5_core 0000:08:00.0: print_health_info:442:(pid 912): synd 0x1: firmware internal error
mlx5_core 0000:08:00.0: print_health_info:444:(pid 912): ext_synd 0x802b
mlx5_core 0000:08:00.0: print_health_info:445:(pid 912): raw fw_ver 0x102001ec

Signed-off-by: Aya Levin <ayal@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/device.h   | 14 ++++++++------
 include/linux/mlx5/mlx5_ifc.h | 10 ++++++++--
 2 files changed, 16 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 347167c18802..f8a0bbb42c3b 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -541,19 +541,21 @@ struct mlx5_cmd_layout {
 	u8		status_own;
 };
 
-enum mlx5_fatal_assert_bit_offsets {
-	MLX5_RFR_OFFSET = 31,
+enum mlx5_rfr_severity_bit_offsets {
+	MLX5_RFR_BIT_OFFSET = 0x7,
 };
 
 struct health_buffer {
-	__be32		assert_var[5];
-	__be32		rsvd0[3];
+	__be32		assert_var[6];
+	__be32		rsvd0[2];
 	__be32		assert_exit_ptr;
 	__be32		assert_callra;
-	__be32		rsvd1[2];
+	__be32		rsvd1[1];
+	__be32		time;
 	__be32		fw_ver;
 	__be32		hw_id;
-	__be32		rfr;
+	u8		rfr_severity;
+	u8		rsvd2[3];
 	u8		irisc_index;
 	u8		synd;
 	__be16		ext_synd;
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 09e43019d877..6d292b5b8992 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -4149,13 +4149,19 @@ struct mlx5_ifc_health_buffer_bits {
 
 	u8         assert_callra[0x20];
 
-	u8         reserved_at_140[0x40];
+	u8         reserved_at_140[0x20];
+
+	u8         time[0x20];
 
 	u8         fw_version[0x20];
 
 	u8         hw_id[0x20];
 
-	u8         reserved_at_1c0[0x20];
+	u8         rfr[0x1];
+	u8         reserved_at_1c1[0x3];
+	u8         valid[0x1];
+	u8         severity[0x3];
+	u8         reserved_at_1c8[0x18];
 
 	u8         irisc_index[0x8];
 	u8         synd[0x8];
-- 
cgit v1.2.3


From 5a1023deeed02a2078bcc11eec1c4be31e85892d Mon Sep 17 00:00:00 2001
From: Aya Levin <ayal@nvidia.com>
Date: Wed, 13 Oct 2021 09:45:22 +0300
Subject: net/mlx5: Add periodic update of host time to firmware

Firmware logs its asserts also to non-volatile memory. In order to
reduce drift between the NIC and the host, the driver sets the host
epoch-time to the firmware every hour.

Signed-off-by: Aya Levin <ayal@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/driver.h   |  2 ++
 include/linux/mlx5/mlx5_ifc.h | 12 ++++++++++++
 2 files changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 3f4c0f2314a5..f617dfbcd9fd 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -134,6 +134,7 @@ enum {
 	MLX5_REG_MCIA		 = 0x9014,
 	MLX5_REG_MFRL		 = 0x9028,
 	MLX5_REG_MLCR		 = 0x902b,
+	MLX5_REG_MRTC		 = 0x902d,
 	MLX5_REG_MTRC_CAP	 = 0x9040,
 	MLX5_REG_MTRC_CONF	 = 0x9041,
 	MLX5_REG_MTRC_STDB	 = 0x9042,
@@ -440,6 +441,7 @@ struct mlx5_core_health {
 	struct work_struct		report_work;
 	struct devlink_health_reporter *fw_reporter;
 	struct devlink_health_reporter *fw_fatal_reporter;
+	struct delayed_work		update_fw_log_ts_work;
 };
 
 struct mlx5_qp_table {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 6d292b5b8992..746381eccccf 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -10358,6 +10358,17 @@ struct mlx5_ifc_pddr_reg_bits {
 	union mlx5_ifc_pddr_reg_page_data_auto_bits page_data;
 };
 
+struct mlx5_ifc_mrtc_reg_bits {
+	u8         time_synced[0x1];
+	u8         reserved_at_1[0x1f];
+
+	u8         reserved_at_20[0x20];
+
+	u8         time_h[0x20];
+
+	u8         time_l[0x20];
+};
+
 union mlx5_ifc_ports_control_registers_document_bits {
 	struct mlx5_ifc_bufferx_reg_bits bufferx_reg;
 	struct mlx5_ifc_eth_2819_cntrs_grp_data_layout_bits eth_2819_cntrs_grp_data_layout;
@@ -10419,6 +10430,7 @@ union mlx5_ifc_ports_control_registers_document_bits {
 	struct mlx5_ifc_mirc_reg_bits mirc_reg;
 	struct mlx5_ifc_mfrl_reg_bits mfrl_reg;
 	struct mlx5_ifc_mtutc_reg_bits mtutc_reg;
+	struct mlx5_ifc_mrtc_reg_bits mrtc_reg;
 	u8         reserved_at_0[0x60e0];
 };
 
-- 
cgit v1.2.3


From 46ae40b94d8826591472798114a723cc7feac7a7 Mon Sep 17 00:00:00 2001
From: Shay Drory <shayd@nvidia.com>
Date: Thu, 12 Aug 2021 11:53:34 +0300
Subject: net/mlx5: Let user configure io_eq_size param

Currently, each I/O EQ is taking 128KB of memory. This size
is not needed in all use cases, and is critical with large scale.
Hence, allow user to configure the size of I/O EQs.

For example, to reduce I/O EQ size to 64, execute:
$ devlink resource set pci/0000:00:0b.0 path /io_eq_size/ size 64
$ devlink dev reload pci/0000:00:0b.0

Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Reviewed-by: Parav Pandit <parav@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/driver.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index f617dfbcd9fd..47c07f95bbe1 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -797,10 +797,6 @@ struct mlx5_db {
 	int			index;
 };
 
-enum {
-	MLX5_COMP_EQ_SIZE = 1024,
-};
-
 enum {
 	MLX5_PTYS_IB = 1 << 0,
 	MLX5_PTYS_EN = 1 << 2,
-- 
cgit v1.2.3


From a6cb08daa3b459e3dab1d98c67cb8c931f4d81a5 Mon Sep 17 00:00:00 2001
From: Shay Drory <shayd@nvidia.com>
Date: Wed, 13 Oct 2021 09:57:54 +0300
Subject: net/mlx5: Let user configure event_eq_size param

Event EQ is an EQ which received the notification of almost all the
events generated by the NIC.
Currently, each event EQ is taking 512KB of memory. This size is not
needed in most use cases, and is critical with large scale. Hence,
allow user to configure the size of the event EQ.

For example to reduce event EQ size to 64, execute::
$ devlink resource set pci/0000:00:0b.0 path /event_eq_size/ size 64
$ devlink dev reload pci/0000:00:0b.0

Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Reviewed-by: Parav Pandit <parav@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/eq.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/eq.h b/include/linux/mlx5/eq.h
index ea3ff5a8ced3..11161e427608 100644
--- a/include/linux/mlx5/eq.h
+++ b/include/linux/mlx5/eq.h
@@ -5,7 +5,6 @@
 #define MLX5_CORE_EQ_H
 
 #define MLX5_NUM_CMD_EQE   (32)
-#define MLX5_NUM_ASYNC_EQE (0x1000)
 #define MLX5_NUM_SPARE_EQE (0x80)
 
 struct mlx5_eq;
-- 
cgit v1.2.3


From 554604061979d656bbbec50f101526349cd5aa5f Mon Sep 17 00:00:00 2001
From: Shay Drory <shayd@nvidia.com>
Date: Mon, 16 Aug 2021 08:41:08 +0300
Subject: net/mlx5: Let user configure max_macs param

Currently, max_macs is taking 70Kbytes of memory per function. This
size is not needed in all use cases, and is critical with large scale.
Hence, allow user to configure the number of max_macs.

For example, to reduce the number of max_macs to 1, execute::
$ devlink dev param set pci/0000:00:0b.0 name max_macs value 1 \
              cmode driverinit
$ devlink dev reload pci/0000:00:0b.0

Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Reviewed-by: Parav Pandit <parav@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 746381eccccf..97465d00de9d 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1603,7 +1603,7 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 
 	u8         ext_stride_num_range[0x1];
 	u8         roce_rw_supported[0x1];
-	u8         reserved_at_3a2[0x1];
+	u8         log_max_current_uc_list_wr_supported[0x1];
 	u8         log_max_stride_sz_rq[0x5];
 	u8         reserved_at_3a8[0x3];
 	u8         log_min_stride_sz_rq[0x5];
-- 
cgit v1.2.3


From 95cadae320be46583078690ac89ffe63c95cc9d2 Mon Sep 17 00:00:00 2001
From: Qian Cai <quic_qiancai@quicinc.com>
Date: Mon, 25 Oct 2021 17:05:28 -0400
Subject: fortify: strlen: Avoid shadowing previous locals

The __compiletime_strlen() macro expansion will shadow p_size and p_len
local variables. No callers currently use any of the shadowed names
for their "p" variable, so there are no code generation problems.

Add "__" prefixes to variable definitions __compiletime_strlen() to
avoid new W=2 warnings:

./include/linux/fortify-string.h: In function 'strnlen':
./include/linux/fortify-string.h:17:9: warning: declaration of 'p_size' shadows a previous local [-Wshadow]
   17 |  size_t p_size = __builtin_object_size(p, 1); \
      |         ^~~~~~
./include/linux/fortify-string.h:77:17: note: in expansion of macro '__compiletime_strlen'
   77 |  size_t p_len = __compiletime_strlen(p);
      |                 ^~~~~~~~~~~~~~~~~~~~
./include/linux/fortify-string.h:76:9: note: shadowed declaration is here
   76 |  size_t p_size = __builtin_object_size(p, 1);
      |         ^~~~~~

Signed-off-by: Qian Cai <quic_qiancai@quicinc.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20211025210528.261643-1-quic_qiancai@quicinc.com
---
 include/linux/fortify-string.h | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h
index fdb0a74c9ca2..a6cd6815f249 100644
--- a/include/linux/fortify-string.h
+++ b/include/linux/fortify-string.h
@@ -10,18 +10,18 @@ void __read_overflow(void) __compiletime_error("detected read beyond size of obj
 void __read_overflow2(void) __compiletime_error("detected read beyond size of object (2nd parameter)");
 void __write_overflow(void) __compiletime_error("detected write beyond size of object (1st parameter)");
 
-#define __compiletime_strlen(p)				\
-({							\
-	unsigned char *__p = (unsigned char *)(p);      \
-	size_t ret = (size_t)-1;			\
-	size_t p_size = __builtin_object_size(p, 1);	\
-	if (p_size != (size_t)-1) {			\
-		size_t p_len = p_size - 1;		\
-		if (__builtin_constant_p(__p[p_len]) &&	\
-		    __p[p_len] == '\0')			\
-			ret = __builtin_strlen(__p);	\
-	}						\
-	ret;						\
+#define __compiletime_strlen(p)					\
+({								\
+	unsigned char *__p = (unsigned char *)(p);		\
+	size_t __ret = (size_t)-1;				\
+	size_t __p_size = __builtin_object_size(p, 1);		\
+	if (__p_size != (size_t)-1) {				\
+		size_t __p_len = __p_size - 1;			\
+		if (__builtin_constant_p(__p[__p_len]) &&	\
+		    __p[__p_len] == '\0')			\
+			__ret = __builtin_strlen(__p);		\
+	}							\
+	__ret;							\
 })
 
 #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
-- 
cgit v1.2.3


From ef57c1610dd8fba5031bf71e0db73356190de151 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 25 Oct 2021 09:48:17 -0700
Subject: ipv6: move inet6_sk(sk)->rx_dst_cookie to sk->sk_rx_dst_cookie

Increase cache locality by moving rx_dst_coookie next to sk->sk_rx_dst

This removes one or two cache line misses in IPv6 early demux (TCP/UDP)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/ipv6.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index ef4a69865737..c383630d3f06 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -282,7 +282,6 @@ struct ipv6_pinfo {
 	__be32			rcv_flowinfo;
 
 	__u32			dst_cookie;
-	__u32			rx_dst_cookie;
 
 	struct ipv6_mc_socklist	__rcu *ipv6_mc_list;
 	struct ipv6_ac_socklist	*ipv6_ac_list;
-- 
cgit v1.2.3


From 79ca6f74dae067681a779fd573c2eb59649989bc Mon Sep 17 00:00:00 2001
From: Hao Wu <hao.wu@rubrik.com>
Date: Wed, 8 Sep 2021 02:26:06 -0700
Subject: tpm: fix Atmel TPM crash caused by too frequent queries

The Atmel TPM 1.2 chips crash with error
`tpm_try_transmit: send(): error -62` since kernel 4.14.
It is observed from the kernel log after running `tpm_sealdata -z`.
The error thrown from the command is as follows
```
$ tpm_sealdata -z
Tspi_Key_LoadKey failed: 0x00001087 - layer=tddl,
code=0087 (135), I/O error
```

The issue was reproduced with the following Atmel TPM chip:
```
$ tpm_version
T0  TPM 1.2 Version Info:
  Chip Version:        1.2.66.1
  Spec Level:          2
  Errata Revision:     3
  TPM Vendor ID:       ATML
  TPM Version:         01010000
  Manufacturer Info:   41544d4c
```

The root cause of the issue is due to the TPM calls to msleep()
were replaced with usleep_range() [1], which reduces
the actual timeout. Via experiments, it is observed that
the original msleep(5) actually sleeps for 15ms.
Because of a known timeout issue in Atmel TPM 1.2 chip,
the shorter timeout than 15ms can cause the error described above.

A few further changes in kernel 4.16 [2] and 4.18 [3, 4] further
reduced the timeout to less than 1ms. With experiments,
the problematic timeout in the latest kernel is the one
for `wait_for_tpm_stat`.

To fix it, the patch reverts the timeout of `wait_for_tpm_stat`
to 15ms for all Atmel TPM 1.2 chips, but leave it untouched
for Ateml TPM 2.0 chip, and chips from other vendors.
As explained above, the chosen 15ms timeout is
the actual timeout before this issue introduced,
thus the old value is used here.
Particularly, TPM_ATML_TIMEOUT_WAIT_STAT_MIN is set to 14700us,
TPM_ATML_TIMEOUT_WAIT_STAT_MIN is set to 15000us according to
the existing TPM_TIMEOUT_RANGE_US (300us).
The fixed has been tested in the system with the affected Atmel chip
with no issues observed after boot up.

References:
[1] 9f3fc7bcddcb tpm: replace msleep() with usleep_range() in TPM
1.2/2.0 generic drivers
[2] cf151a9a44d5 tpm: reduce tpm polling delay in tpm_tis_core
[3] 59f5a6b07f64 tpm: reduce poll sleep time in tpm_transmit()
[4] 424eaf910c32 tpm: reduce polling time to usecs for even finer
granularity

Fixes: 9f3fc7bcddcb ("tpm: replace msleep() with usleep_range() in TPM 1.2/2.0 generic drivers")
Link: https://patchwork.kernel.org/project/linux-integrity/patch/20200926223150.109645-1-hao.wu@rubrik.com/
Signed-off-by: Hao Wu <hao.wu@rubrik.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 include/linux/tpm.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index aa11fe323c56..12d827734686 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -269,6 +269,7 @@ enum tpm2_cc_attrs {
 #define TPM_VID_INTEL    0x8086
 #define TPM_VID_WINBOND  0x1050
 #define TPM_VID_STM      0x104A
+#define TPM_VID_ATML     0x1114
 
 enum tpm_chip_flags {
 	TPM_CHIP_FLAG_TPM2		= BIT(1),
-- 
cgit v1.2.3


From 1bdda24c4af64cd2d65dec5192ab624c5fee7ca0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 21 Oct 2021 15:55:05 -0700
Subject: signal: Add an optional check for altstack size

New x86 FPU features will be very large, requiring ~10k of stack in
signal handlers.  These new features require a new approach called
"dynamic features".

The kernel currently tries to ensure that altstacks are reasonably
sized. Right now, on x86, sys_sigaltstack() requires a size of >=2k.
However, that 2k is a constant. Simply raising that 2k requirement
to >10k for the new features would break existing apps which have a
compiled-in size of 2k.

Instead of universally enforcing a larger stack, prohibit a process from
using dynamic features without properly-sized altstacks. This must be
enforced in two places:

 * A dynamic feature can not be enabled without an large-enough altstack
   for each process thread.
 * Once a dynamic feature is enabled, any request to install a too-small
   altstack will be rejected

The dynamic feature enabling code must examine each thread in a
process to ensure that the altstacks are large enough. Add a new lock
(sigaltstack_lock()) to ensure that threads can not race and change
their altstack after being examined.

Add the infrastructure in form of a config option and provide empty
stubs for architectures which do not need dynamic altstack size checks.

This implementation will be fleshed out for x86 in a future patch called

  x86/arch_prctl: Add controls for dynamic XSTATE components

  [dhansen: commit message. ]

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Chang S. Bae <chang.seok.bae@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lkml.kernel.org/r/20211021225527.10184-2-chang.seok.bae@intel.com
---
 include/linux/signal.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/signal.h b/include/linux/signal.h
index 3f96a6374e4f..7d34105e20c6 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -464,6 +464,12 @@ int __save_altstack(stack_t __user *, unsigned long);
 	unsafe_put_user(t->sas_ss_size, &__uss->ss_size, label); \
 } while (0);
 
+#ifdef CONFIG_DYNAMIC_SIGFRAME
+bool sigaltstack_size_valid(size_t ss_size);
+#else
+static inline bool sigaltstack_size_valid(size_t size) { return true; }
+#endif /* !CONFIG_DYNAMIC_SIGFRAME */
+
 #ifdef CONFIG_PROC_FS
 struct seq_file;
 extern void render_sigset_t(struct seq_file *, const char *, sigset_t *);
-- 
cgit v1.2.3


From 0953fb263714e1c8c1c3d395036d9a14310081dd Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Wed, 20 Oct 2021 20:23:09 +0100
Subject: irq: remove handle_domain_{irq,nmi}()

Now that entry code handles IRQ entry (including setting the IRQ regs)
before calling irqchip code, irqchip code can safely call
generic_handle_domain_irq(), and there's no functional reason for it to
call handle_domain_irq().

Let's cement this split of responsibility and remove handle_domain_irq()
entirely, updating irqchip drivers to call generic_handle_domain_irq().

For consistency, handle_domain_nmi() is similarly removed and replaced
with a generic_handle_domain_nmi() function which also does not perform
any entry logic.

Previously handle_domain_{irq,nmi}() had a WARN_ON() which would fire
when they were called in an inappropriate context. So that we can
identify similar issues going forward, similar WARN_ON_ONCE() logic is
added to the generic_handle_*() functions, and comments are updated for
clarity and consistency.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Marc Zyngier <maz@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irqdesc.h | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index 59aea39785bf..93d270ca0c56 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -168,14 +168,7 @@ int generic_handle_irq(unsigned int irq);
  * conversion failed.
  */
 int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq);
-
-#ifdef CONFIG_HANDLE_DOMAIN_IRQ
-int handle_domain_irq(struct irq_domain *domain,
-		      unsigned int hwirq, struct pt_regs *regs);
-
-int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq,
-		      struct pt_regs *regs);
-#endif
+int generic_handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq);
 #endif
 
 /* Test to see if a driver has successfully requested an irq */
-- 
cgit v1.2.3


From 8d15a7295d33538954df2bca6a4011e4311a9cc2 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Thu, 21 Oct 2021 18:04:14 +0100
Subject: genirq: Hide irq_cpu_{on,off}line() behind a deprecated option

irq_cpu_{on,off}line() are now only used by the Octeon platform.
Make their use conditional on this plaform being enabled, and
otherwise hidden away.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Tested-by: Serge Semin <fancer.lancer@gmail.com>
Link: https://lore.kernel.org/r/20211021170414.3341522-4-maz@kernel.org
---
 include/linux/irq.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index c8293c817646..0c746d325206 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -524,9 +524,10 @@ struct irq_chip {
 	void		(*irq_bus_lock)(struct irq_data *data);
 	void		(*irq_bus_sync_unlock)(struct irq_data *data);
 
+#ifdef CONFIG_DEPRECATED_IRQ_CPU_ONOFFLINE
 	void		(*irq_cpu_online)(struct irq_data *data);
 	void		(*irq_cpu_offline)(struct irq_data *data);
-
+#endif
 	void		(*irq_suspend)(struct irq_data *data);
 	void		(*irq_resume)(struct irq_data *data);
 	void		(*irq_pm_shutdown)(struct irq_data *data);
@@ -606,8 +607,10 @@ struct irqaction;
 extern int setup_percpu_irq(unsigned int irq, struct irqaction *new);
 extern void remove_percpu_irq(unsigned int irq, struct irqaction *act);
 
+#ifdef CONFIG_DEPRECATED_IRQ_CPU_ONOFFLINE
 extern void irq_cpu_online(void);
 extern void irq_cpu_offline(void);
+#endif
 extern int irq_set_affinity_locked(struct irq_data *data,
 				   const struct cpumask *cpumask, bool force);
 extern int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info);
-- 
cgit v1.2.3


From 99ce45d5e7dbde399997a630f45ac9f654fa4bcc Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@codeconstruct.com.au>
Date: Tue, 26 Oct 2021 09:57:28 +0800
Subject: mctp: Implement extended addressing

This change allows an extended address struct - struct sockaddr_mctp_ext
- to be passed to sendmsg/recvmsg. This allows userspace to specify
output ifindex and physical address information (for sendmsg) or receive
the input ifindex/physaddr for incoming messages (for recvmsg). This is
typically used by userspace for MCTP address discovery and assignment
operations.

The extended addressing facility is conditional on a new sockopt:
MCTP_OPT_ADDR_EXT; userspace must explicitly enable addressing before
the kernel will consume/populate the extended address data.

Includes a fix for an uninitialised var:
Reported-by: kernel test robot <lkp@intel.com>

Signed-off-by: Jeremy Kerr <jk@codeconstruct.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/socket.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 7612d760b6a9..8ef26d89ef49 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -365,6 +365,7 @@ struct ucred {
 #define SOL_TLS		282
 #define SOL_XDP		283
 #define SOL_MPTCP	284
+#define SOL_MCTP	285
 
 /* IPX options */
 #define IPX_TYPE	1
-- 
cgit v1.2.3


From 8e20f591f204f8db7f1182918f8e2285d3f589e0 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Tue, 26 Oct 2021 11:06:01 +0100
Subject: net: phy: add phy_interface_t bitmap support

Add support for a bitmap for phy interface modes, which includes:
- a macro to declare the interface bitmap
- an inline helper to zero the interface bitmap
- an inline helper to detect an empty interface bitmap
- inline helpers to do a bitwise AND and OR operations on two interface
  bitmaps

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 04e90423fa88..96e43fbb2dd8 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -155,6 +155,40 @@ typedef enum {
 	PHY_INTERFACE_MODE_MAX,
 } phy_interface_t;
 
+/* PHY interface mode bitmap handling */
+#define DECLARE_PHY_INTERFACE_MASK(name) \
+	DECLARE_BITMAP(name, PHY_INTERFACE_MODE_MAX)
+
+static inline void phy_interface_zero(unsigned long *intf)
+{
+	bitmap_zero(intf, PHY_INTERFACE_MODE_MAX);
+}
+
+static inline bool phy_interface_empty(const unsigned long *intf)
+{
+	return bitmap_empty(intf, PHY_INTERFACE_MODE_MAX);
+}
+
+static inline void phy_interface_and(unsigned long *dst, const unsigned long *a,
+				     const unsigned long *b)
+{
+	bitmap_and(dst, a, b, PHY_INTERFACE_MODE_MAX);
+}
+
+static inline void phy_interface_or(unsigned long *dst, const unsigned long *a,
+				    const unsigned long *b)
+{
+	bitmap_or(dst, a, b, PHY_INTERFACE_MODE_MAX);
+}
+
+static inline void phy_interface_set_rgmii(unsigned long *intf)
+{
+	__set_bit(PHY_INTERFACE_MODE_RGMII, intf);
+	__set_bit(PHY_INTERFACE_MODE_RGMII_ID, intf);
+	__set_bit(PHY_INTERFACE_MODE_RGMII_RXID, intf);
+	__set_bit(PHY_INTERFACE_MODE_RGMII_TXID, intf);
+}
+
 /*
  * phy_supported_speeds - return all speeds currently supported by a PHY device
  */
-- 
cgit v1.2.3


From 38c310eb46f5f80213a92093af11af270c209a76 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Tue, 26 Oct 2021 11:06:06 +0100
Subject: net: phylink: add MAC phy_interface_t bitmap

Add a phy_interface_t bitmap so the MAC driver can specifiy which PHY
interface modes it supports.

Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phylink.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index f7b5ed06a815..bc4b866cd99b 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -76,6 +76,7 @@ struct phylink_config {
 	bool ovr_an_inband;
 	void (*get_fixed_state)(struct phylink_config *config,
 				struct phylink_link_state *state);
+	DECLARE_PHY_INTERFACE_MASK(supported_interfaces);
 };
 
 /**
-- 
cgit v1.2.3


From d25f3a74f30aace819163dfa54f2a4b8ca1dc932 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Tue, 26 Oct 2021 11:06:11 +0100
Subject: net: phylink: use supported_interfaces for phylink validation

If the network device supplies a supported interface bitmap, we can use
that during phylink's validation to simplify MAC drivers in two ways by
using the supported_interfaces bitmap to:

1. reject unsupported interfaces before calling into the MAC driver.
2. generate the set of all supported link modes across all supported
   interfaces (used mainly for SFP, but also some 10G PHYs.)

Suggested-by: Sean Anderson <sean.anderson@seco.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phylink.h | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index bc4b866cd99b..f037470b6fb3 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -67,6 +67,8 @@ enum phylink_op_type {
  * @ovr_an_inband: if true, override PCS to MLO_AN_INBAND
  * @get_fixed_state: callback to execute to determine the fixed link state,
  *		     if MAC link is at %MLO_AN_FIXED mode.
+ * @supported_interfaces: bitmap describing which PHY_INTERFACE_MODE_xxx
+ *                        are supported by the MAC/PCS.
  */
 struct phylink_config {
 	struct device *dev;
@@ -134,8 +136,14 @@ struct phylink_mac_ops {
  * based on @state->advertising and/or @state->speed and update
  * @state->interface accordingly. See phylink_helper_basex_speed().
  *
- * When @state->interface is %PHY_INTERFACE_MODE_NA, phylink expects the
- * MAC driver to return all supported link modes.
+ * When @config->supported_interfaces has been set, phylink will iterate
+ * over the supported interfaces to determine the full capability of the
+ * MAC. The validation function must not print errors if @state->interface
+ * is set to an unexpected value.
+ *
+ * When @config->supported_interfaces is empty, phylink will call this
+ * function with @state->interface set to %PHY_INTERFACE_MODE_NA, and
+ * expects the MAC driver to return all supported link modes.
  *
  * If the @state->interface mode is not supported, then the @supported
  * mask must be cleared.
-- 
cgit v1.2.3


From e60feb445fce9e51c1558a6aa7faf9dd5ded533b Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Thu, 14 Oct 2021 13:11:00 -0400
Subject: fs: export an inode_update_time helper

If you already have an inode and need to update the time on the inode
there is no way to do this properly.  Export this helper to allow file
systems to update time on the inode so the appropriate handler is
called, either ->update_time or generic_update_time.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 include/linux/fs.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index e7a633353fd2..56eba723477e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2498,6 +2498,8 @@ enum file_time_flags {
 
 extern bool atime_needs_update(const struct path *, struct inode *);
 extern void touch_atime(const struct path *);
+int inode_update_time(struct inode *inode, struct timespec64 *time, int flags);
+
 static inline void file_accessed(struct file *file)
 {
 	if (!(file->f_flags & O_NOATIME))
-- 
cgit v1.2.3


From 6b3671746a8a3aa05316b829e1357060f35009c1 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 26 Oct 2021 08:29:39 -0700
Subject: net/mlx5: remove the recent devlink params

revert commit 46ae40b94d88 ("net/mlx5: Let user configure io_eq_size param")
revert commit a6cb08daa3b4 ("net/mlx5: Let user configure event_eq_size param")
revert commit 554604061979 ("net/mlx5: Let user configure max_macs param")

The EQE parameters are applicable to more drivers, they should
be configured via standard API, probably ethtool. Example of
another driver needing something similar:

https://lore.kernel.org/all/1633454136-14679-3-git-send-email-sbhatta@marvell.com/

The last param for "max_macs" is probably fine but the documentation
is severely lacking. The meaning and implications for changing the
param need to be stated.

Link: https://lore.kernel.org/r/20211026152939.3125950-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/mlx5/driver.h   | 4 ++++
 include/linux/mlx5/eq.h       | 1 +
 include/linux/mlx5/mlx5_ifc.h | 2 +-
 3 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 47c07f95bbe1..f617dfbcd9fd 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -797,6 +797,10 @@ struct mlx5_db {
 	int			index;
 };
 
+enum {
+	MLX5_COMP_EQ_SIZE = 1024,
+};
+
 enum {
 	MLX5_PTYS_IB = 1 << 0,
 	MLX5_PTYS_EN = 1 << 2,
diff --git a/include/linux/mlx5/eq.h b/include/linux/mlx5/eq.h
index 11161e427608..ea3ff5a8ced3 100644
--- a/include/linux/mlx5/eq.h
+++ b/include/linux/mlx5/eq.h
@@ -5,6 +5,7 @@
 #define MLX5_CORE_EQ_H
 
 #define MLX5_NUM_CMD_EQE   (32)
+#define MLX5_NUM_ASYNC_EQE (0x1000)
 #define MLX5_NUM_SPARE_EQE (0x80)
 
 struct mlx5_eq;
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 97465d00de9d..746381eccccf 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1603,7 +1603,7 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 
 	u8         ext_stride_num_range[0x1];
 	u8         roce_rw_supported[0x1];
-	u8         log_max_current_uc_list_wr_supported[0x1];
+	u8         reserved_at_3a2[0x1];
 	u8         log_max_stride_sz_rq[0x5];
 	u8         reserved_at_3a8[0x3];
 	u8         log_min_stride_sz_rq[0x5];
-- 
cgit v1.2.3


From cfe6807d82e97e81c3209dca9448f091e1448a57 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Tue, 26 Oct 2021 18:58:11 +0100
Subject: gpio: Allow per-parent interrupt data

The core gpiolib code is able to deal with multiple interrupt parents
for a single gpio irqchip. It however only allows a single piece
of data to be conveyed to all flow handlers (either the gpio_chip
or some other, driver-specific data).

This means that drivers have to go through some interesting dance
to find the correct context, something that isn't great in interrupt
context (see aebdc8abc9db86e2bd33070fc2f961012fff74b4 for a prime
example).

Instead, offer an optional way for a pinctrl/gpio driver to provide
an array of pointers which gets used to provide the correct context
to the flow handler.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20211026175815.52703-2-joey.gouly@arm.com
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/gpio/driver.h | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index a0f9901dcae6..a673a359e20b 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -168,11 +168,18 @@ struct gpio_irq_chip {
 
 	/**
 	 * @parent_handler_data:
+	 * @parent_handler_data_array:
 	 *
 	 * Data associated, and passed to, the handler for the parent
-	 * interrupt.
+	 * interrupt. Can either be a single pointer if @per_parent_data
+	 * is false, or an array of @num_parents pointers otherwise.  If
+	 * @per_parent_data is true, @parent_handler_data_array cannot be
+	 * NULL.
 	 */
-	void *parent_handler_data;
+	union {
+		void *parent_handler_data;
+		void **parent_handler_data_array;
+	};
 
 	/**
 	 * @num_parents:
@@ -203,6 +210,14 @@ struct gpio_irq_chip {
 	 */
 	bool threaded;
 
+	/**
+	 * @per_parent_data:
+	 *
+	 * True if parent_handler_data_array describes a @num_parents
+	 * sized array to be used as parent data.
+	 */
+	bool per_parent_data;
+
 	/**
 	 * @init_hw: optional routine to initialize hardware before
 	 * an IRQ chip will be added. This is quite useful when
-- 
cgit v1.2.3


From 7529cc7fbd9c02eda6851f3260416cbe198a321d Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@nvidia.com>
Date: Wed, 30 Dec 2020 11:41:52 +0200
Subject: lib: bitmap: Introduce node-aware alloc API

Expose new node-aware API for bitmap allocation:
bitmap_alloc_node() / bitmap_zalloc_node().

Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/bitmap.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 37f36dad18bd..a241dcf50f39 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -123,6 +123,8 @@ struct device;
  */
 unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags);
 unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags);
+unsigned long *bitmap_alloc_node(unsigned int nbits, gfp_t flags, int node);
+unsigned long *bitmap_zalloc_node(unsigned int nbits, gfp_t flags, int node);
 void bitmap_free(const unsigned long *bitmap);
 
 /* Managed variants of the above. */
-- 
cgit v1.2.3


From 50f477fe9933193e960785f1192be801d7cd307a Mon Sep 17 00:00:00 2001
From: Ben Ben-Ishay <benishay@nvidia.com>
Date: Thu, 2 Jul 2020 17:22:45 +0300
Subject: net/mlx5e: Rename lro_timeout to packet_merge_timeout

TIR stands for transport interface receive, the TIR object is
responsible for performing all transport related operations on
the receive side like packet processing, demultiplexing the packets
to different RQ's, etc.
lro_timeout is a field in the TIR that is used to set the timeout for lro
session, this series introduces new packet merge type, therefore rename
lro_timeout to packet_merge_timeout for all packet merge types.

Signed-off-by: Ben Ben-Ishay <benishay@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 746381eccccf..21c0fd478afa 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -3361,8 +3361,8 @@ enum {
 };
 
 enum {
-	MLX5_TIRC_LRO_ENABLE_MASK_IPV4_LRO  = 0x1,
-	MLX5_TIRC_LRO_ENABLE_MASK_IPV6_LRO  = 0x2,
+	MLX5_TIRC_PACKET_MERGE_MASK_IPV4_LRO  = BIT(0),
+	MLX5_TIRC_PACKET_MERGE_MASK_IPV6_LRO  = BIT(1),
 };
 
 enum {
@@ -3387,7 +3387,7 @@ struct mlx5_ifc_tirc_bits {
 
 	u8         reserved_at_80[0x4];
 	u8         lro_timeout_period_usecs[0x10];
-	u8         lro_enable_mask[0x4];
+	u8         packet_merge_mask[0x4];
 	u8         lro_max_ip_payload_size[0x8];
 
 	u8         reserved_at_a0[0x40];
-- 
cgit v1.2.3


From 7025329d208cae45937d2a0910786a45b9981475 Mon Sep 17 00:00:00 2001
From: Ben Ben-Ishay <benishay@nvidia.com>
Date: Wed, 9 Sep 2020 17:36:39 +0300
Subject: net/mlx5: Add SHAMPO caps, HW bits and enumerations

This commit adds SHAMPO bit to hca_cap and SHAMPO capabilities structure,
SHAMPO related HW spec hardware fields and enumerations.
SHAMPO stands for: split headers and merge payload offload.
SHAMPO new fields:
WQ:
 - headers_mkey: mkey that represents the headers buffer, where the packets
   headers will be written by the HW.

 - shampo_enable: flag to verify if the WQ supports SHAMPO feature.

 - log_reservation_size: the log of the reservation size where the data of
   the packet will be written by the HW.

 - log_max_num_of_packets_per_reservation: log of the maximum number of
   packets that can be written to the same reservation.

 - log_headers_entry_size: log of the header entry size of the headers buffer.

 - log_headers_buffer_entry_num: log of the entries number of the headers buffer.

RQ:
 - shampo_no_match_alignment_granularity: the HW alignment granularity
   in case the received packet doesn't match the current session.

 - shampo_match_criteria_type: the type of match criteria.

 - reservation_timeout: the maximum time that the HW will hold the
   reservation.

mlx5_ifc_shampo_cap_bits, the capabilities of the SHAMPO feature:
 - shampo_log_max_reservation_size: the maximum allowed value of the field
   WQ.log_reservation_size.

 - log_reservation_size: the minimum allowed value of the field
   WQ.log_reservation_size.

 - shampo_min_mss_size: the minimum payload size of packet that can open
   a new session or be merged to a session.

 - shampo_max_log_headers_entry_size: the maximum allowed value of the field
   WQ.log_headers_entry_size

Signed-off-by: Ben Ben-Ishay <benishay@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/device.h   |  4 ++++
 include/linux/mlx5/mlx5_ifc.h | 56 ++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 57 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index f8a0bbb42c3b..0d30a6184e1d 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1186,6 +1186,7 @@ enum mlx5_cap_type {
 	MLX5_CAP_VDPA_EMULATION = 0x13,
 	MLX5_CAP_DEV_EVENT = 0x14,
 	MLX5_CAP_IPSEC,
+	MLX5_CAP_DEV_SHAMPO = 0x1d,
 	MLX5_CAP_GENERAL_2 = 0x20,
 	MLX5_CAP_PORT_SELECTION = 0x25,
 	/* NUM OF CAP Types */
@@ -1431,6 +1432,9 @@ enum mlx5_qcam_feature_groups {
 #define MLX5_CAP_IPSEC(mdev, cap)\
 	MLX5_GET(ipsec_cap, (mdev)->caps.hca[MLX5_CAP_IPSEC]->cur, cap)
 
+#define MLX5_CAP_DEV_SHAMPO(mdev, cap)\
+	MLX5_GET(shampo_cap, mdev->caps.hca_cur[MLX5_CAP_DEV_SHAMPO], cap)
+
 enum {
 	MLX5_CMD_STAT_OK			= 0x0,
 	MLX5_CMD_STAT_INT_ERR			= 0x1,
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 21c0fd478afa..f1c134af5fcf 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1350,7 +1350,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         reserved_at_b0[0x1];
 	u8         uplink_follow[0x1];
 	u8         ts_cqe_to_dest_cqn[0x1];
-	u8         reserved_at_b3[0xd];
+	u8         reserved_at_b3[0x7];
+	u8         shampo[0x1];
+	u8         reserved_at_bb[0x5];
 
 	u8         max_sgl_for_optimized_performance[0x8];
 	u8         log_max_cq_sz[0x8];
@@ -1893,7 +1895,21 @@ struct mlx5_ifc_wq_bits {
 	u8         reserved_at_139[0x4];
 	u8         log_wqe_stride_size[0x3];
 
-	u8         reserved_at_140[0x4c0];
+	u8         reserved_at_140[0x80];
+
+	u8         headers_mkey[0x20];
+
+	u8         shampo_enable[0x1];
+	u8         reserved_at_1e1[0x4];
+	u8         log_reservation_size[0x3];
+	u8         reserved_at_1e8[0x5];
+	u8         log_max_num_of_packets_per_reservation[0x3];
+	u8         reserved_at_1f0[0x6];
+	u8         log_headers_entry_size[0x2];
+	u8         reserved_at_1f8[0x4];
+	u8         log_headers_buffer_entry_num[0x4];
+
+	u8         reserved_at_200[0x400];
 
 	struct mlx5_ifc_cmd_pas_bits pas[];
 };
@@ -3169,6 +3185,20 @@ struct mlx5_ifc_roce_addr_layout_bits {
 	u8         reserved_at_e0[0x20];
 };
 
+struct mlx5_ifc_shampo_cap_bits {
+	u8    reserved_at_0[0x3];
+	u8    shampo_log_max_reservation_size[0x5];
+	u8    reserved_at_8[0x3];
+	u8    shampo_log_min_reservation_size[0x5];
+	u8    shampo_min_mss_size[0x10];
+
+	u8    reserved_at_20[0x3];
+	u8    shampo_max_log_headers_entry_size[0x5];
+	u8    reserved_at_28[0x18];
+
+	u8    reserved_at_40[0x7c0];
+};
+
 union mlx5_ifc_hca_cap_union_bits {
 	struct mlx5_ifc_cmd_hca_cap_bits cmd_hca_cap;
 	struct mlx5_ifc_cmd_hca_cap_2_bits cmd_hca_cap_2;
@@ -3187,6 +3217,7 @@ union mlx5_ifc_hca_cap_union_bits {
 	struct mlx5_ifc_tls_cap_bits tls_cap;
 	struct mlx5_ifc_device_mem_cap_bits device_mem_cap;
 	struct mlx5_ifc_virtio_emulation_cap_bits virtio_emulation_cap;
+	struct mlx5_ifc_shampo_cap_bits shampo_cap;
 	u8         reserved_at_0[0x8000];
 };
 
@@ -3363,6 +3394,7 @@ enum {
 enum {
 	MLX5_TIRC_PACKET_MERGE_MASK_IPV4_LRO  = BIT(0),
 	MLX5_TIRC_PACKET_MERGE_MASK_IPV6_LRO  = BIT(1),
+	MLX5_TIRC_PACKET_MERGE_MASK_SHAMPO    = BIT(2),
 };
 
 enum {
@@ -3569,6 +3601,18 @@ enum {
 	MLX5_RQC_STATE_ERR  = 0x3,
 };
 
+enum {
+	MLX5_RQC_SHAMPO_NO_MATCH_ALIGNMENT_GRANULARITY_BYTE    = 0x0,
+	MLX5_RQC_SHAMPO_NO_MATCH_ALIGNMENT_GRANULARITY_STRIDE  = 0x1,
+	MLX5_RQC_SHAMPO_NO_MATCH_ALIGNMENT_GRANULARITY_PAGE    = 0x2,
+};
+
+enum {
+	MLX5_RQC_SHAMPO_MATCH_CRITERIA_TYPE_NO_MATCH    = 0x0,
+	MLX5_RQC_SHAMPO_MATCH_CRITERIA_TYPE_EXTENDED    = 0x1,
+	MLX5_RQC_SHAMPO_MATCH_CRITERIA_TYPE_FIVE_TUPLE  = 0x2,
+};
+
 struct mlx5_ifc_rqc_bits {
 	u8         rlky[0x1];
 	u8	   delay_drop_en[0x1];
@@ -3601,7 +3645,13 @@ struct mlx5_ifc_rqc_bits {
 	u8         reserved_at_c0[0x10];
 	u8         hairpin_peer_vhca[0x10];
 
-	u8         reserved_at_e0[0xa0];
+	u8         reserved_at_e0[0x46];
+	u8         shampo_no_match_alignment_granularity[0x2];
+	u8         reserved_at_128[0x6];
+	u8         shampo_match_criteria_type[0x2];
+	u8         reservation_timeout[0x10];
+
+	u8         reserved_at_140[0x40];
 
 	struct mlx5_ifc_wq_bits wq;
 };
-- 
cgit v1.2.3


From eaee12f046924eeb1210c7e4f3b326603ff1bd85 Mon Sep 17 00:00:00 2001
From: Khalid Manaa <khalidm@nvidia.com>
Date: Wed, 9 Jun 2021 12:27:32 +0300
Subject: net/mlx5e: Rename TIR lro functions to TIR packet merge functions

This series introduces new packet merge type, therefore rename lro
functions to packet merge to support the new merge type:
- Generalize + rename mlx5e_build_tir_ctx_lro to
  mlx5e_build_tir_ctx_packet_merge.
- Rename mlx5e_modify_tirs_lro to mlx5e_modify_tirs_packet_merge.
- Rename lro bit in mlx5_ifc_modify_tir_bitmask_bits to packet_merge.
- Rename lro_en in mlx5e_params to packet_merge_type type and combine
  packet_merge params into one struct mlx5e_packet_merge_param.

Signed-off-by: Khalid Manaa <khalidm@nvidia.com>
Signed-off-by: Ben Ben-Ishay <benishay@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index f1c134af5fcf..0bb78c04336c 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -6707,7 +6707,7 @@ struct mlx5_ifc_modify_tir_bitmask_bits {
 	u8         reserved_at_3c[0x1];
 	u8         hash[0x1];
 	u8         reserved_at_3e[0x1];
-	u8         lro[0x1];
+	u8         packet_merge[0x1];
 };
 
 struct mlx5_ifc_modify_tir_out_bits {
-- 
cgit v1.2.3


From d7b896acbdcb3ef5dab1fd2f33ba5a8da6ba1dda Mon Sep 17 00:00:00 2001
From: Ben Ben-Ishay <benishay@nvidia.com>
Date: Tue, 14 Jul 2020 14:40:32 +0300
Subject: net/mlx5e: Add support to klm_umr_wqe

This commit adds the needed definitions for using the klm_umr_wqe.
UMR stands for user-mode memory registration, is a mechanism to alter
address translation properties of MKEY by posting WorkQueueElement
aka WQE on send queue.
MKEY stands for memory key, MKEY are used to describe a region in memory that
can be later used by HW.
KLM stands for {Key, Length, MemVa}, KLM_MKEY is indirect MKEY that enables
to map multiple memory spaces with different sizes in unified MKEY.
klm_umr_wqe is a UMR that use to update a KLM_MKEY.
SHAMPO feature uses KLM_MKEY for memory registration of his header buffer.

Signed-off-by: Ben Ben-Ishay <benishay@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/device.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 0d30a6184e1d..c920e5932368 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -290,6 +290,7 @@ enum {
 	MLX5_UMR_INLINE			= (1 << 7),
 };
 
+#define MLX5_UMR_KLM_ALIGNMENT 4
 #define MLX5_UMR_MTT_ALIGNMENT 0x40
 #define MLX5_UMR_MTT_MASK      (MLX5_UMR_MTT_ALIGNMENT - 1)
 #define MLX5_UMR_MTT_MIN_CHUNK_SIZE MLX5_UMR_MTT_ALIGNMENT
-- 
cgit v1.2.3


From f97d5c2a453e26071e3b0ec12161de57c4a237c4 Mon Sep 17 00:00:00 2001
From: Khalid Manaa <khalidm@nvidia.com>
Date: Tue, 19 May 2020 15:45:38 +0300
Subject: net/mlx5e: Add handle SHAMPO cqe support

This patch adds the new CQE SHAMPO fields:
- flush: indicates that we must close the current session and pass the SKB
         to the network stack.

- match: indicates that the current packet matches the oppened session,
         the packet will be merge into the current SKB.

- header_size: the size of the packet headers that written into the headers
               buffer.

- header_entry_index: the entry index in the headers buffer.

- data_offset: packets data offset in the WQE.

Also new cqe handler is added to handle SHAMPO packets:
- The new handler uses CQE SHAMPO fields to build the SKB.
  CQE's Flush and match fields are not used in this patch, packets are not
  merged in this patch.

Signed-off-by: Khalid Manaa <khalidm@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/device.h | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index c920e5932368..56bcf95d4ab7 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -800,10 +800,23 @@ struct mlx5_cqe64 {
 	u8		tls_outer_l3_tunneled;
 	u8		rsvd0;
 	__be16		wqe_id;
-	u8		lro_tcppsh_abort_dupack;
-	u8		lro_min_ttl;
-	__be16		lro_tcp_win;
-	__be32		lro_ack_seq_num;
+	union {
+		struct {
+			u8	tcppsh_abort_dupack;
+			u8	min_ttl;
+			__be16	tcp_win;
+			__be32	ack_seq_num;
+		} lro;
+		struct {
+			u8	reserved0:1;
+			u8	match:1;
+			u8	flush:1;
+			u8	reserved3:5;
+			u8	header_size;
+			__be16	header_entry_index;
+			__be32	data_offset;
+		} shampo;
+	};
 	__be32		rss_hash_result;
 	u8		rss_hash_type;
 	u8		ml_path;
@@ -873,7 +886,7 @@ static inline u8 get_cqe_opcode(struct mlx5_cqe64 *cqe)
 
 static inline u8 get_cqe_lro_tcppsh(struct mlx5_cqe64 *cqe)
 {
-	return (cqe->lro_tcppsh_abort_dupack >> 6) & 1;
+	return (cqe->lro.tcppsh_abort_dupack >> 6) & 1;
 }
 
 static inline u8 get_cqe_l4_hdr_type(struct mlx5_cqe64 *cqe)
-- 
cgit v1.2.3


From a2247f19ee1c5ad75ef095cdfb909a3244b88aa8 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Wed, 27 Oct 2021 11:22:19 +0900
Subject: block: Add independent access ranges support

The Concurrent Positioning Ranges VPD page (for SCSI) and data log page
(for ATA) contain parameters describing the set of contiguous LBAs that
can be served independently by a single LUN multi-actuator hard-disk.
Similarly, a logically defined block device composed of multiple disks
can in some cases execute requests directed at different sector ranges
in parallel. A dm-linear device aggregating 2 block devices together is
an example.

This patch implements support for exposing a block device independent
access ranges to the user through sysfs to allow optimizing device
accesses to increase performance.

To describe the set of independent sector ranges of a device (actuators
of a multi-actuator HDDs or table entries of a dm-linear device),
The type struct blk_independent_access_ranges is introduced. This
structure describes the sector ranges using an array of
struct blk_independent_access_range structures. This range structure
defines the start sector and number of sectors of the access range.
The ranges in the array cannot overlap and must contain all sectors
within the device capacity.

The function disk_set_independent_access_ranges() allows a device
driver to signal to the block layer that a device has multiple
independent access ranges.  In this case, a struct
blk_independent_access_ranges is attached to the device request queue
by the function disk_set_independent_access_ranges(). The function
disk_alloc_independent_access_ranges() is provided for drivers to
allocate this structure.

struct blk_independent_access_ranges contains kobjects (struct kobject)
to expose to the user through sysfs the set of independent access ranges
supported by a device. When the device is initialized, sysfs
registration of the ranges information is done from blk_register_queue()
using the block layer internal function
disk_register_independent_access_ranges(). If a driver calls
disk_set_independent_access_ranges() for a registered queue, e.g. when a
device is revalidated, disk_set_independent_access_ranges() will execute
disk_register_independent_access_ranges() to update the sysfs attribute
files.  The sysfs file structure created starts from the
independent_access_ranges sub-directory and contains the start sector
and number of sectors of each range, with the information for each range
grouped in numbered sub-directories.

E.g. for a dual actuator HDD, the user sees:

$ tree /sys/block/sdk/queue/independent_access_ranges/
/sys/block/sdk/queue/independent_access_ranges/
|-- 0
|   |-- nr_sectors
|   `-- sector
`-- 1
    |-- nr_sectors
    `-- sector

For a regular device with a single access range, the
independent_access_ranges sysfs directory does not exist.

Device revalidation may lead to changes to this structure and to the
attribute values. When manipulated, the queue sysfs_lock and
sysfs_dir_lock mutexes are held for atomicity, similarly to how the
blk-mq and elevator sysfs queue sub-directories are protected.

The code related to the management of independent access ranges is
added in the new file block/blk-ia-ranges.c.

Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Link: https://lore.kernel.org/r/20211027022223.183838-2-damien.lemoal@wdc.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f72ccb2829db..6d95a4b36cfa 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -150,6 +150,34 @@ static inline int blkdev_zone_mgmt_ioctl(struct block_device *bdev,
 
 #endif /* CONFIG_BLK_DEV_ZONED */
 
+/*
+ * Independent access ranges: struct blk_independent_access_range describes
+ * a range of contiguous sectors that can be accessed using device command
+ * execution resources that are independent from the resources used for
+ * other access ranges. This is typically found with single-LUN multi-actuator
+ * HDDs where each access range is served by a different set of heads.
+ * The set of independent ranges supported by the device is defined using
+ * struct blk_independent_access_ranges. The independent ranges must not overlap
+ * and must include all sectors within the disk capacity (no sector holes
+ * allowed).
+ * For a device with multiple ranges, requests targeting sectors in different
+ * ranges can be executed in parallel. A request can straddle an access range
+ * boundary.
+ */
+struct blk_independent_access_range {
+	struct kobject		kobj;
+	struct request_queue	*queue;
+	sector_t		sector;
+	sector_t		nr_sectors;
+};
+
+struct blk_independent_access_ranges {
+	struct kobject				kobj;
+	bool					sysfs_registered;
+	unsigned int				nr_ia_ranges;
+	struct blk_independent_access_range	ia_range[];
+};
+
 struct request_queue {
 	struct request		*last_merge;
 	struct elevator_queue	*elevator;
@@ -331,6 +359,12 @@ struct request_queue {
 
 #define BLK_MAX_WRITE_HINTS	5
 	u64			write_hints[BLK_MAX_WRITE_HINTS];
+
+	/*
+	 * Independent sector access ranges. This is always NULL for
+	 * devices that do not have multiple independent access ranges.
+	 */
+	struct blk_independent_access_ranges *ia_ranges;
 };
 
 /* Keep blk_queue_flag_name[] in sync with the definitions below */
@@ -698,6 +732,11 @@ extern void blk_queue_update_dma_alignment(struct request_queue *, int);
 extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
 extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua);
 
+struct blk_independent_access_ranges *
+disk_alloc_independent_access_ranges(struct gendisk *disk, int nr_ia_ranges);
+void disk_set_independent_access_ranges(struct gendisk *disk,
+				struct blk_independent_access_ranges *iars);
+
 /*
  * Elevator features for blk_queue_required_elevator_features:
  */
-- 
cgit v1.2.3


From fe22e1c2f705676a705d821301fc52eecc2fe055 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Wed, 27 Oct 2021 11:22:21 +0900
Subject: libata: support concurrent positioning ranges log

Add support to discover if an ATA device supports the Concurrent
Positioning Ranges data log (address 0x47), indicating that the device
is capable of seeking to multiple different locations in parallel using
multiple actuators serving different LBA ranges.

Also add support to translate the concurrent positioning ranges log
into its equivalent Concurrent Positioning Ranges VPD page B9h in
libata-scsi.c.

The format of the Concurrent Positioning Ranges Log is defined in ACS-5
r9.

Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Link: https://lore.kernel.org/r/20211027022223.183838-4-damien.lemoal@wdc.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/ata.h    |  1 +
 include/linux/libata.h | 15 +++++++++++++++
 2 files changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ata.h b/include/linux/ata.h
index 1b44f40c7700..199e47e97d64 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -329,6 +329,7 @@ enum {
 	ATA_LOG_SECURITY	  = 0x06,
 	ATA_LOG_SATA_SETTINGS	  = 0x08,
 	ATA_LOG_ZONED_INFORMATION = 0x09,
+	ATA_LOG_CONCURRENT_POSITIONING_RANGES = 0x47,
 
 	/* Identify device SATA settings log:*/
 	ATA_LOG_DEVSLP_OFFSET	  = 0x30,
diff --git a/include/linux/libata.h b/include/linux/libata.h
index c0c64f03e107..236ec689056a 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -676,6 +676,18 @@ struct ata_ering {
 	struct ata_ering_entry	ring[ATA_ERING_SIZE];
 };
 
+struct ata_cpr {
+	u8			num;
+	u8			num_storage_elements;
+	u64			start_lba;
+	u64			num_lbas;
+};
+
+struct ata_cpr_log {
+	u8			nr_cpr;
+	struct ata_cpr		cpr[];
+};
+
 struct ata_device {
 	struct ata_link		*link;
 	unsigned int		devno;		/* 0 or 1 */
@@ -735,6 +747,9 @@ struct ata_device {
 	u32			zac_zones_optimal_nonseq;
 	u32			zac_zones_max_open;
 
+	/* Concurrent positioning ranges */
+	struct ata_cpr_log	*cpr_log;
+
 	/* error history */
 	int			spdn_cnt;
 	/* ering is CLEAR_END, read comment above CLEAR_END */
-- 
cgit v1.2.3


From 785d584c30ffc1224027536fe55bdc15ee509f14 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 18 Oct 2021 17:21:37 +0200
Subject: nvme: add new discovery log page entry definitions

TP8014 adds a new SUBTYPE value and a new field EFLAGS for the
discovery log page entry.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/nvme.h | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 357482dedb59..855dd9b3e84b 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -27,8 +27,14 @@
 #define NVME_NSID_ALL		0xffffffff
 
 enum nvme_subsys_type {
-	NVME_NQN_DISC	= 1,		/* Discovery type target subsystem */
-	NVME_NQN_NVME	= 2,		/* NVME type target subsystem */
+	/* Referral to another discovery type target subsystem */
+	NVME_NQN_DISC	= 1,
+
+	/* NVME type target subsystem */
+	NVME_NQN_NVME	= 2,
+
+	/* Current discovery type target subsystem */
+	NVME_NQN_CURR	= 3,
 };
 
 enum nvme_ctrl_type {
@@ -1312,6 +1318,12 @@ struct nvmf_common_command {
 
 #define MAX_DISC_LOGS	255
 
+/* Discovery log page entry flags (EFLAGS): */
+enum {
+	NVME_DISC_EFLAGS_EPCSD		= (1 << 1),
+	NVME_DISC_EFLAGS_DUPRETINFO	= (1 << 0),
+};
+
 /* Discovery log page entry */
 struct nvmf_disc_rsp_page_entry {
 	__u8		trtype;
@@ -1321,7 +1333,8 @@ struct nvmf_disc_rsp_page_entry {
 	__le16		portid;
 	__le16		cntlid;
 	__le16		asqsz;
-	__u8		resv8[22];
+	__le16		eflags;
+	__u8		resv10[20];
 	char		trsvcid[NVMF_TRSVCID_SIZE];
 	__u8		resv64[192];
 	char		subnqn[NVMF_NQN_FIELD_LEN];
-- 
cgit v1.2.3


From 33f98a9798f55fd77c36ce79d8cfa5329e55a789 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 13 Oct 2021 10:57:41 -0700
Subject: x86/boot/compressed: Avoid duplicate malloc() implementations

The early malloc() and free() implementation in include/linux/decompress/mm.h
(which is also included by the static decompressors) is static. This is
fine when the only thing interested in using malloc() is the decompression
code, but the x86 early boot environment may use malloc() in a couple places,
leading to a potential collision when the static copies of the available
memory region ("malloc_ptr") gets reset to the global "free_mem_ptr" value.
As it happened, the existing usage pattern was accidentally safe because each
user did 1 malloc() and 1 free() before returning and were not nested:

extract_kernel() (misc.c)
	choose_random_location() (kaslr.c)
		mem_avoid_init()
			handle_mem_options()
				malloc()
				...
				free()
	...
	parse_elf() (misc.c)
		malloc()
		...
		free()

Once the future FGKASLR series is added, however, it will insert
additional malloc() calls local to fgkaslr.c in the middle of
parse_elf()'s malloc()/free() pair:

	parse_elf() (misc.c)
		malloc()
		if (...) {
			layout_randomized_image(output, &ehdr, phdrs);
				malloc() <- boom
				...
		else
			layout_image(output, &ehdr, phdrs);
		free()

To avoid collisions, there must be a single implementation of malloc().
Adjust include/linux/decompress/mm.h so that visibility can be
controlled, provide prototypes in misc.h, and implement the functions in
misc.c. This also results in a small size savings:

$ size vmlinux.before vmlinux.after
   text    data     bss     dec     hex filename
8842314     468  178320 9021102  89a6ae vmlinux.before
8842240     468  178320 9021028  89a664 vmlinux.after

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20211013175742.1197608-4-keescook@chromium.org
---
 include/linux/decompress/mm.h | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/decompress/mm.h b/include/linux/decompress/mm.h
index 868e9eacd69e..9192986b1a73 100644
--- a/include/linux/decompress/mm.h
+++ b/include/linux/decompress/mm.h
@@ -25,13 +25,21 @@
 #define STATIC_RW_DATA static
 #endif
 
+/*
+ * When an architecture needs to share the malloc()/free() implementation
+ * between compilation units, it needs to have non-local visibility.
+ */
+#ifndef MALLOC_VISIBLE
+#define MALLOC_VISIBLE static
+#endif
+
 /* A trivial malloc implementation, adapted from
  *  malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994
  */
 STATIC_RW_DATA unsigned long malloc_ptr;
 STATIC_RW_DATA int malloc_count;
 
-static void *malloc(int size)
+MALLOC_VISIBLE void *malloc(int size)
 {
 	void *p;
 
@@ -52,7 +60,7 @@ static void *malloc(int size)
 	return p;
 }
 
-static void free(void *where)
+MALLOC_VISIBLE void free(void *where)
 {
 	malloc_count--;
 	if (!malloc_count)
-- 
cgit v1.2.3


From 9baf93d68bcc3d0a6042283b82603c076e25e4f5 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Mon, 25 Oct 2021 16:27:16 -0300
Subject: fsnotify: pass data_type to fsnotify_name()

Align the arguments of fsnotify_name() to those of fsnotify().

Link: https://lore.kernel.org/r/20211025192746.66445-2-krisman@collabora.com
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/linux/fsnotify.h | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 12d3a7d308ab..d1144d7c3536 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -26,20 +26,21 @@
  * FS_EVENT_ON_CHILD mask on the parent inode and will not be reported if only
  * the child is interested and not the parent.
  */
-static inline void fsnotify_name(struct inode *dir, __u32 mask,
-				 struct inode *child,
-				 const struct qstr *name, u32 cookie)
+static inline int fsnotify_name(__u32 mask, const void *data, int data_type,
+				struct inode *dir, const struct qstr *name,
+				u32 cookie)
 {
 	if (atomic_long_read(&dir->i_sb->s_fsnotify_connectors) == 0)
-		return;
+		return 0;
 
-	fsnotify(mask, child, FSNOTIFY_EVENT_INODE, dir, name, NULL, cookie);
+	return fsnotify(mask, data, data_type, dir, name, NULL, cookie);
 }
 
 static inline void fsnotify_dirent(struct inode *dir, struct dentry *dentry,
 				   __u32 mask)
 {
-	fsnotify_name(dir, mask, d_inode(dentry), &dentry->d_name, 0);
+	fsnotify_name(mask, d_inode(dentry), FSNOTIFY_EVENT_INODE,
+		      dir, &dentry->d_name, 0);
 }
 
 static inline void fsnotify_inode(struct inode *inode, __u32 mask)
@@ -154,8 +155,10 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 		new_dir_mask |= FS_ISDIR;
 	}
 
-	fsnotify_name(old_dir, old_dir_mask, source, old_name, fs_cookie);
-	fsnotify_name(new_dir, new_dir_mask, source, new_name, fs_cookie);
+	fsnotify_name(old_dir_mask, source, FSNOTIFY_EVENT_INODE,
+		      old_dir, old_name, fs_cookie);
+	fsnotify_name(new_dir_mask, source, FSNOTIFY_EVENT_INODE,
+		      new_dir, new_name, fs_cookie);
 
 	if (target)
 		fsnotify_link_count(target);
@@ -209,7 +212,8 @@ static inline void fsnotify_link(struct inode *dir, struct inode *inode,
 	fsnotify_link_count(inode);
 	audit_inode_child(dir, new_dentry, AUDIT_TYPE_CHILD_CREATE);
 
-	fsnotify_name(dir, FS_CREATE, inode, &new_dentry->d_name, 0);
+	fsnotify_name(FS_CREATE, inode, FSNOTIFY_EVENT_INODE,
+		      dir, &new_dentry->d_name, 0);
 }
 
 /*
-- 
cgit v1.2.3


From fd5a3ff49a19aa69e2bc1e26e98037c2d778e61a Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Mon, 25 Oct 2021 16:27:17 -0300
Subject: fsnotify: pass dentry instead of inode data

Define a new data type to pass for event - FSNOTIFY_EVENT_DENTRY.
Use it to pass the dentry instead of it's ->d_inode where available.

This is needed in preparation to the refactor to retrieve the super
block from the data field.  In some cases (i.e. mkdir in kernfs), the
data inode comes from a negative dentry, such that no super block
information would be available. By receiving the dentry itself, instead
of the inode, fsnotify can derive the super block even on these cases.

Link: https://lore.kernel.org/r/20211025192746.66445-3-krisman@collabora.com
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
[Expand explanation in commit message]
Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/linux/fsnotify.h         |  5 ++---
 include/linux/fsnotify_backend.h | 16 ++++++++++++++++
 2 files changed, 18 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index d1144d7c3536..df0fa4687a18 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -39,8 +39,7 @@ static inline int fsnotify_name(__u32 mask, const void *data, int data_type,
 static inline void fsnotify_dirent(struct inode *dir, struct dentry *dentry,
 				   __u32 mask)
 {
-	fsnotify_name(mask, d_inode(dentry), FSNOTIFY_EVENT_INODE,
-		      dir, &dentry->d_name, 0);
+	fsnotify_name(mask, dentry, FSNOTIFY_EVENT_DENTRY, dir, &dentry->d_name, 0);
 }
 
 static inline void fsnotify_inode(struct inode *inode, __u32 mask)
@@ -87,7 +86,7 @@ notify_child:
  */
 static inline void fsnotify_dentry(struct dentry *dentry, __u32 mask)
 {
-	fsnotify_parent(dentry, mask, d_inode(dentry), FSNOTIFY_EVENT_INODE);
+	fsnotify_parent(dentry, mask, dentry, FSNOTIFY_EVENT_DENTRY);
 }
 
 static inline int fsnotify_file(struct file *file, __u32 mask)
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 1ce66748a2d2..a2db821e8a8f 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -248,6 +248,7 @@ enum fsnotify_data_type {
 	FSNOTIFY_EVENT_NONE,
 	FSNOTIFY_EVENT_PATH,
 	FSNOTIFY_EVENT_INODE,
+	FSNOTIFY_EVENT_DENTRY,
 };
 
 static inline struct inode *fsnotify_data_inode(const void *data, int data_type)
@@ -255,6 +256,8 @@ static inline struct inode *fsnotify_data_inode(const void *data, int data_type)
 	switch (data_type) {
 	case FSNOTIFY_EVENT_INODE:
 		return (struct inode *)data;
+	case FSNOTIFY_EVENT_DENTRY:
+		return d_inode(data);
 	case FSNOTIFY_EVENT_PATH:
 		return d_inode(((const struct path *)data)->dentry);
 	default:
@@ -262,6 +265,19 @@ static inline struct inode *fsnotify_data_inode(const void *data, int data_type)
 	}
 }
 
+static inline struct dentry *fsnotify_data_dentry(const void *data, int data_type)
+{
+	switch (data_type) {
+	case FSNOTIFY_EVENT_DENTRY:
+		/* Non const is needed for dget() */
+		return (struct dentry *)data;
+	case FSNOTIFY_EVENT_PATH:
+		return ((const struct path *)data)->dentry;
+	default:
+		return NULL;
+	}
+}
+
 static inline const struct path *fsnotify_data_path(const void *data,
 						    int data_type)
 {
-- 
cgit v1.2.3


From dabe729dddca550446e9cc118c96d1f91703345b Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Mon, 25 Oct 2021 16:27:18 -0300
Subject: fsnotify: clarify contract for create event hooks

Clarify argument names and contract for fsnotify_create() and
fsnotify_mkdir() to reflect the anomaly of kernfs, which leaves dentries
negavite after mkdir/create.

Remove the WARN_ON(!inode) in audit code that were added by the Fixes
commit under the wrong assumption that dentries cannot be negative after
mkdir/create.

Fixes: aa93bdc5500c ("fsnotify: use helpers to access data by data_type")
Link: https://lore.kernel.org/linux-fsdevel/87mtp5yz0q.fsf@collabora.com/
Link: https://lore.kernel.org/r/20211025192746.66445-4-krisman@collabora.com
Reviewed-by: Jan Kara <jack@suse.cz>
Reported-by: Gabriel Krisman Bertazi <krisman@collabora.com>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/linux/fsnotify.h | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index df0fa4687a18..1e5f7435a4b5 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -192,16 +192,22 @@ static inline void fsnotify_inoderemove(struct inode *inode)
 
 /*
  * fsnotify_create - 'name' was linked in
+ *
+ * Caller must make sure that dentry->d_name is stable.
+ * Note: some filesystems (e.g. kernfs) leave @dentry negative and instantiate
+ * ->d_inode later
  */
-static inline void fsnotify_create(struct inode *inode, struct dentry *dentry)
+static inline void fsnotify_create(struct inode *dir, struct dentry *dentry)
 {
-	audit_inode_child(inode, dentry, AUDIT_TYPE_CHILD_CREATE);
+	audit_inode_child(dir, dentry, AUDIT_TYPE_CHILD_CREATE);
 
-	fsnotify_dirent(inode, dentry, FS_CREATE);
+	fsnotify_dirent(dir, dentry, FS_CREATE);
 }
 
 /*
  * fsnotify_link - new hardlink in 'inode' directory
+ *
+ * Caller must make sure that new_dentry->d_name is stable.
  * Note: We have to pass also the linked inode ptr as some filesystems leave
  *   new_dentry->d_inode NULL and instantiate inode pointer later
  */
@@ -230,12 +236,16 @@ static inline void fsnotify_unlink(struct inode *dir, struct dentry *dentry)
 
 /*
  * fsnotify_mkdir - directory 'name' was created
+ *
+ * Caller must make sure that dentry->d_name is stable.
+ * Note: some filesystems (e.g. kernfs) leave @dentry negative and instantiate
+ * ->d_inode later
  */
-static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
+static inline void fsnotify_mkdir(struct inode *dir, struct dentry *dentry)
 {
-	audit_inode_child(inode, dentry, AUDIT_TYPE_CHILD_CREATE);
+	audit_inode_child(dir, dentry, AUDIT_TYPE_CHILD_CREATE);
 
-	fsnotify_dirent(inode, dentry, FS_CREATE | FS_ISDIR);
+	fsnotify_dirent(dir, dentry, FS_CREATE | FS_ISDIR);
 }
 
 /*
-- 
cgit v1.2.3


From 808967a0a4d2f4ce6a2005c5692fffbecaf018c1 Mon Sep 17 00:00:00 2001
From: Gabriel Krisman Bertazi <krisman@collabora.com>
Date: Mon, 25 Oct 2021 16:27:23 -0300
Subject: fsnotify: Add helper to detect overflow_event

Similarly to fanotify_is_perm_event and friends, provide a helper
predicate to say whether a mask is of an overflow event.

Link: https://lore.kernel.org/r/20211025192746.66445-9-krisman@collabora.com
Suggested-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/linux/fsnotify_backend.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index a2db821e8a8f..749bc85e1d1c 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -510,6 +510,11 @@ static inline void fsnotify_queue_overflow(struct fsnotify_group *group)
 	fsnotify_add_event(group, group->overflow_event, NULL, NULL);
 }
 
+static inline bool fsnotify_is_overflow_event(u32 mask)
+{
+	return mask & FS_Q_OVERFLOW;
+}
+
 static inline bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group)
 {
 	assert_spin_locked(&group->notification_lock);
-- 
cgit v1.2.3


From 1ad03c3a326a86e259389592117252c851873395 Mon Sep 17 00:00:00 2001
From: Gabriel Krisman Bertazi <krisman@collabora.com>
Date: Mon, 25 Oct 2021 16:27:24 -0300
Subject: fsnotify: Add wrapper around fsnotify_add_event

fsnotify_add_event is growing in number of parameters, which in most
case are just passed a NULL pointer.  So, split out a new
fsnotify_insert_event function to clean things up for users who don't
need an insert hook.

Link: https://lore.kernel.org/r/20211025192746.66445-10-krisman@collabora.com
Suggested-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/linux/fsnotify_backend.h | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 749bc85e1d1c..b323d0c4b967 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -498,16 +498,25 @@ extern int fsnotify_fasync(int fd, struct file *file, int on);
 extern void fsnotify_destroy_event(struct fsnotify_group *group,
 				   struct fsnotify_event *event);
 /* attach the event to the group notification queue */
-extern int fsnotify_add_event(struct fsnotify_group *group,
-			      struct fsnotify_event *event,
-			      int (*merge)(struct fsnotify_group *,
-					   struct fsnotify_event *),
-			      void (*insert)(struct fsnotify_group *,
-					     struct fsnotify_event *));
+extern int fsnotify_insert_event(struct fsnotify_group *group,
+				 struct fsnotify_event *event,
+				 int (*merge)(struct fsnotify_group *,
+					      struct fsnotify_event *),
+				 void (*insert)(struct fsnotify_group *,
+						struct fsnotify_event *));
+
+static inline int fsnotify_add_event(struct fsnotify_group *group,
+				     struct fsnotify_event *event,
+				     int (*merge)(struct fsnotify_group *,
+						  struct fsnotify_event *))
+{
+	return fsnotify_insert_event(group, event, merge, NULL);
+}
+
 /* Queue overflow event to a notification group */
 static inline void fsnotify_queue_overflow(struct fsnotify_group *group)
 {
-	fsnotify_add_event(group, group->overflow_event, NULL, NULL);
+	fsnotify_add_event(group, group->overflow_event, NULL);
 }
 
 static inline bool fsnotify_is_overflow_event(u32 mask)
-- 
cgit v1.2.3


From 29335033c574a15334015d8c4e36862cff3d3384 Mon Sep 17 00:00:00 2001
From: Gabriel Krisman Bertazi <krisman@collabora.com>
Date: Mon, 25 Oct 2021 16:27:25 -0300
Subject: fsnotify: Retrieve super block from the data field

Some file system events (i.e. FS_ERROR) might not be associated with an
inode or directory.  For these, we can retrieve the super block from the
data field.  But, since the super_block is available in the data field
on every event type, simplify the code to always retrieve it from there,
through a new helper.

Link: https://lore.kernel.org/r/20211025192746.66445-11-krisman@collabora.com
Suggested-by: Jan Kara <jack@suse.cz>
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/linux/fsnotify_backend.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index b323d0c4b967..035438fe4a43 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -289,6 +289,21 @@ static inline const struct path *fsnotify_data_path(const void *data,
 	}
 }
 
+static inline struct super_block *fsnotify_data_sb(const void *data,
+						   int data_type)
+{
+	switch (data_type) {
+	case FSNOTIFY_EVENT_INODE:
+		return ((struct inode *)data)->i_sb;
+	case FSNOTIFY_EVENT_DENTRY:
+		return ((struct dentry *)data)->d_sb;
+	case FSNOTIFY_EVENT_PATH:
+		return ((const struct path *)data)->dentry->d_sb;
+	default:
+		return NULL;
+	}
+}
+
 enum fsnotify_obj_type {
 	FSNOTIFY_OBJ_TYPE_INODE,
 	FSNOTIFY_OBJ_TYPE_PARENT,
-- 
cgit v1.2.3


From 24dca90590509a7a6cbe0650100c90c5b8a3468a Mon Sep 17 00:00:00 2001
From: Gabriel Krisman Bertazi <krisman@collabora.com>
Date: Mon, 25 Oct 2021 16:27:26 -0300
Subject: fsnotify: Protect fsnotify_handle_inode_event from no-inode events

FAN_FS_ERROR allows events without inodes - i.e. for file system-wide
errors.  Even though fsnotify_handle_inode_event is not currently used
by fanotify, this patch protects other backends from cases where neither
inode or dir are provided.  Also document the constraints of the
interface (inode and dir cannot be both NULL).

Link: https://lore.kernel.org/r/20211025192746.66445-12-krisman@collabora.com
Suggested-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/linux/fsnotify_backend.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 035438fe4a43..b71dc788018e 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -136,6 +136,7 @@ struct mem_cgroup;
  * @dir:	optional directory associated with event -
  *		if @file_name is not NULL, this is the directory that
  *		@file_name is relative to.
+ *		Either @inode or @dir must be non-NULL.
  * @file_name:	optional file name associated with event
  * @cookie:	inotify rename cookie
  *
-- 
cgit v1.2.3


From 330ae77d2a5b0af32c0f29e139bf28ec8591de59 Mon Sep 17 00:00:00 2001
From: Gabriel Krisman Bertazi <krisman@collabora.com>
Date: Mon, 25 Oct 2021 16:27:27 -0300
Subject: fsnotify: Pass group argument to free_event

For group-wide mempool backed events, like FS_ERROR, the free_event
callback will need to reference the group's mempool to free the memory.
Wire that argument into the current callers.

Link: https://lore.kernel.org/r/20211025192746.66445-13-krisman@collabora.com
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/linux/fsnotify_backend.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index b71dc788018e..3a7c31436182 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -156,7 +156,7 @@ struct fsnotify_ops {
 			    const struct qstr *file_name, u32 cookie);
 	void (*free_group_priv)(struct fsnotify_group *group);
 	void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group);
-	void (*free_event)(struct fsnotify_event *event);
+	void (*free_event)(struct fsnotify_group *group, struct fsnotify_event *event);
 	/* called on final put+free to free memory */
 	void (*free_mark)(struct fsnotify_mark *mark);
 };
-- 
cgit v1.2.3


From 4fe595cf1c80e7a5af4d00c4da29def64aff57a2 Mon Sep 17 00:00:00 2001
From: Gabriel Krisman Bertazi <krisman@collabora.com>
Date: Mon, 25 Oct 2021 16:27:31 -0300
Subject: fanotify: Require fid_mode for any non-fd event

Like inode events, FAN_FS_ERROR will require fid mode.  Therefore,
convert the verification during fanotify_mark(2) to require fid for any
non-fd event.  This means fid_mode will not only be required for inode
events, but for any event that doesn't provide a descriptor.

Link: https://lore.kernel.org/r/20211025192746.66445-17-krisman@collabora.com
Suggested-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/linux/fanotify.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index eec3b7c40811..52d464802d99 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -84,6 +84,9 @@ extern struct ctl_table fanotify_table[]; /* for sysctl */
  */
 #define FANOTIFY_DIRENT_EVENTS	(FAN_MOVE | FAN_CREATE | FAN_DELETE)
 
+/* Events that can be reported with event->fd */
+#define FANOTIFY_FD_EVENTS (FANOTIFY_PATH_EVENTS | FANOTIFY_PERM_EVENTS)
+
 /* Events that can only be reported with data type FSNOTIFY_EVENT_INODE */
 #define FANOTIFY_INODE_EVENTS	(FANOTIFY_DIRENT_EVENTS | \
 				 FAN_ATTRIB | FAN_MOVE_SELF | FAN_DELETE_SELF)
-- 
cgit v1.2.3


From 9daa811073fa19c08e8aad3b90f9235fed161acf Mon Sep 17 00:00:00 2001
From: Gabriel Krisman Bertazi <krisman@collabora.com>
Date: Mon, 25 Oct 2021 16:27:32 -0300
Subject: fsnotify: Support FS_ERROR event type

Expose a new type of fsnotify event for filesystems to report errors for
userspace monitoring tools.  fanotify will send this type of
notification for FAN_FS_ERROR events.  This also introduce a helper for
generating the new event.

Link: https://lore.kernel.org/r/20211025192746.66445-18-krisman@collabora.com
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/linux/fsnotify.h         | 13 +++++++++++++
 include/linux/fsnotify_backend.h | 32 +++++++++++++++++++++++++++++++-
 2 files changed, 44 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 1e5f7435a4b5..787545e87eeb 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -339,4 +339,17 @@ static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid)
 		fsnotify_dentry(dentry, mask);
 }
 
+static inline int fsnotify_sb_error(struct super_block *sb, struct inode *inode,
+				    int error)
+{
+	struct fs_error_report report = {
+		.error = error,
+		.inode = inode,
+		.sb = sb,
+	};
+
+	return fsnotify(FS_ERROR, &report, FSNOTIFY_EVENT_ERROR,
+			NULL, NULL, NULL, 0);
+}
+
 #endif	/* _LINUX_FS_NOTIFY_H */
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 3a7c31436182..00dbaafbcf95 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -42,6 +42,12 @@
 
 #define FS_UNMOUNT		0x00002000	/* inode on umount fs */
 #define FS_Q_OVERFLOW		0x00004000	/* Event queued overflowed */
+#define FS_ERROR		0x00008000	/* Filesystem Error (fanotify) */
+
+/*
+ * FS_IN_IGNORED overloads FS_ERROR.  It is only used internally by inotify
+ * which does not support FS_ERROR.
+ */
 #define FS_IN_IGNORED		0x00008000	/* last inotify event here */
 
 #define FS_OPEN_PERM		0x00010000	/* open event in an permission hook */
@@ -95,7 +101,8 @@
 #define ALL_FSNOTIFY_EVENTS (ALL_FSNOTIFY_DIRENT_EVENTS | \
 			     FS_EVENTS_POSS_ON_CHILD | \
 			     FS_DELETE_SELF | FS_MOVE_SELF | FS_DN_RENAME | \
-			     FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED)
+			     FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED | \
+			     FS_ERROR)
 
 /* Extra flags that may be reported with event or control handling of events */
 #define ALL_FSNOTIFY_FLAGS  (FS_EXCL_UNLINK | FS_ISDIR | FS_IN_ONESHOT | \
@@ -250,6 +257,13 @@ enum fsnotify_data_type {
 	FSNOTIFY_EVENT_PATH,
 	FSNOTIFY_EVENT_INODE,
 	FSNOTIFY_EVENT_DENTRY,
+	FSNOTIFY_EVENT_ERROR,
+};
+
+struct fs_error_report {
+	int error;
+	struct inode *inode;
+	struct super_block *sb;
 };
 
 static inline struct inode *fsnotify_data_inode(const void *data, int data_type)
@@ -261,6 +275,8 @@ static inline struct inode *fsnotify_data_inode(const void *data, int data_type)
 		return d_inode(data);
 	case FSNOTIFY_EVENT_PATH:
 		return d_inode(((const struct path *)data)->dentry);
+	case FSNOTIFY_EVENT_ERROR:
+		return ((struct fs_error_report *)data)->inode;
 	default:
 		return NULL;
 	}
@@ -300,6 +316,20 @@ static inline struct super_block *fsnotify_data_sb(const void *data,
 		return ((struct dentry *)data)->d_sb;
 	case FSNOTIFY_EVENT_PATH:
 		return ((const struct path *)data)->dentry->d_sb;
+	case FSNOTIFY_EVENT_ERROR:
+		return ((struct fs_error_report *) data)->sb;
+	default:
+		return NULL;
+	}
+}
+
+static inline struct fs_error_report *fsnotify_data_error_report(
+							const void *data,
+							int data_type)
+{
+	switch (data_type) {
+	case FSNOTIFY_EVENT_ERROR:
+		return (struct fs_error_report *) data;
 	default:
 		return NULL;
 	}
-- 
cgit v1.2.3


From 734a1a5eccc5f7473002b0669f788e135f1f64aa Mon Sep 17 00:00:00 2001
From: Gabriel Krisman Bertazi <krisman@collabora.com>
Date: Mon, 25 Oct 2021 16:27:34 -0300
Subject: fanotify: Pre-allocate pool of error events

Pre-allocate slots for file system errors to have greater chances of
succeeding, since error events can happen in GFP_NOFS context.  This
patch introduces a group-wide mempool of error events, shared by all
FAN_FS_ERROR marks in this group.

Link: https://lore.kernel.org/r/20211025192746.66445-20-krisman@collabora.com
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/linux/fsnotify_backend.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 00dbaafbcf95..51ef2b079bfa 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -19,6 +19,7 @@
 #include <linux/atomic.h>
 #include <linux/user_namespace.h>
 #include <linux/refcount.h>
+#include <linux/mempool.h>
 
 /*
  * IN_* from inotfy.h lines up EXACTLY with FS_*, this is so we can easily
@@ -246,6 +247,7 @@ struct fsnotify_group {
 			int flags;           /* flags from fanotify_init() */
 			int f_flags; /* event_f_flags from fanotify_init() */
 			struct ucounts *ucounts;
+			mempool_t error_events_pool;
 		} fanotify_data;
 #endif /* CONFIG_FANOTIFY */
 	};
-- 
cgit v1.2.3


From 9709bd548f11a092d124698118013f66e1740f9b Mon Sep 17 00:00:00 2001
From: Gabriel Krisman Bertazi <krisman@collabora.com>
Date: Mon, 25 Oct 2021 16:27:43 -0300
Subject: fanotify: Allow users to request FAN_FS_ERROR events

Wire up the FAN_FS_ERROR event in the fanotify_mark syscall, allowing
user space to request the monitoring of FAN_FS_ERROR events.

These events are limited to filesystem marks, so check it is the
case in the syscall handler.

Link: https://lore.kernel.org/r/20211025192746.66445-29-krisman@collabora.com
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/linux/fanotify.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index 52d464802d99..616af2ea20f3 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -91,9 +91,13 @@ extern struct ctl_table fanotify_table[]; /* for sysctl */
 #define FANOTIFY_INODE_EVENTS	(FANOTIFY_DIRENT_EVENTS | \
 				 FAN_ATTRIB | FAN_MOVE_SELF | FAN_DELETE_SELF)
 
+/* Events that can only be reported with data type FSNOTIFY_EVENT_ERROR */
+#define FANOTIFY_ERROR_EVENTS	(FAN_FS_ERROR)
+
 /* Events that user can request to be notified on */
 #define FANOTIFY_EVENTS		(FANOTIFY_PATH_EVENTS | \
-				 FANOTIFY_INODE_EVENTS)
+				 FANOTIFY_INODE_EVENTS | \
+				 FANOTIFY_ERROR_EVENTS)
 
 /* Events that require a permission response from user */
 #define FANOTIFY_PERM_EVENTS	(FAN_OPEN_PERM | FAN_ACCESS_PERM | \
-- 
cgit v1.2.3


From 1bb6b81029456f4e2e6727c5167f43bdfc34bee5 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 27 Oct 2021 13:21:07 +0100
Subject: block: avoid extra iter advance with async iocb

Nobody cares about iov iterators state if we return -EIOCBQUEUED, so as
the we now have __blkdev_direct_IO_async(), which gets pages only once,
we can skip expensive iov_iter_advance(). It's around 1-2% of all CPU
spent.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/a6158edfbfa2ae3bc24aed29a72f035df18fad2f.1635337135.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bio.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index c88700d1bdc3..fe6bdfbbef66 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -417,6 +417,7 @@ int bio_add_zone_append_page(struct bio *bio, struct page *page,
 void __bio_add_page(struct bio *bio, struct page *page,
 		unsigned int len, unsigned int off);
 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter);
+void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter);
 void __bio_release_pages(struct bio *bio, bool mark_dirty);
 extern void bio_set_pages_dirty(struct bio *bio);
 extern void bio_check_pages_dirty(struct bio *bio);
-- 
cgit v1.2.3


From 5460601de590158b37619f8e18b678aa18da6345 Mon Sep 17 00:00:00 2001
From: Gal Pressman <galpress@amazon.com>
Date: Tue, 12 Oct 2021 15:09:01 +0300
Subject: dma-buf: Fix pin callback comment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The pin callback does not necessarily have to move the memory to system
memory, remove the sentence from the comment.

Link: https://lore.kernel.org/r/20211012120903.96933-2-galpress@amazon.com
Signed-off-by: Gal Pressman <galpress@amazon.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 include/linux/dma-buf.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index 8b32b4bdd590..8afe182fe8f2 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -86,8 +86,8 @@ struct dma_buf_ops {
 	 * @pin:
 	 *
 	 * This is called by dma_buf_pin() and lets the exporter know that the
-	 * DMA-buf can't be moved any more. The exporter should pin the buffer
-	 * into system memory to make sure it is generally accessible by other
+	 * DMA-buf can't be moved any more. Ideally, the exporter should
+	 * pin the buffer so that it is generally accessible by all
 	 * devices.
 	 *
 	 * This is called with the &dmabuf.resv object locked and is mutual
-- 
cgit v1.2.3


From 3ab7b6ac5d829e60c3b89d415811ff1c9f358c8e Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@linaro.org>
Date: Mon, 25 Oct 2021 10:09:23 -0700
Subject: pwm: Introduce single-PWM of_xlate function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The existing pxa driver and the upcoming addition of PWM support in the
TI sn565dsi86 DSI/eDP bridge driver both has a single PWM channel and
thereby a need for a of_xlate function with the period as its single
argument.

Introduce a common helper function in the core that can be used as
of_xlate by such drivers and migrate the pxa driver to use this.

Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Acked-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Tested-by: Steev Klimaszewski <steev@kali.org>
Tested-By: Steev Klimaszewski <steev@kali.org>
Signed-off-by: Robert Foss <robert.foss@linaro.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20211025170925.3096444-1-bjorn.andersson@linaro.org
---
 include/linux/pwm.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 725c9b784e60..dd51d4931fdc 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -414,6 +414,8 @@ struct pwm_device *pwm_request_from_chip(struct pwm_chip *chip,
 
 struct pwm_device *of_pwm_xlate_with_flags(struct pwm_chip *pc,
 		const struct of_phandle_args *args);
+struct pwm_device *of_pwm_single_xlate(struct pwm_chip *pc,
+				       const struct of_phandle_args *args);
 
 struct pwm_device *pwm_get(struct device *dev, const char *con_id);
 struct pwm_device *of_pwm_get(struct device *dev, struct device_node *np,
-- 
cgit v1.2.3


From ce5e48036c9e76a2a5bd4d9079eac273087a533a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E8=B4=87?= <yun.wang@linux.alibaba.com>
Date: Wed, 27 Oct 2021 11:14:44 +0800
Subject: ftrace: disable preemption when recursion locked

As the documentation explained, ftrace_test_recursion_trylock()
and ftrace_test_recursion_unlock() were supposed to disable and
enable preemption properly, however currently this work is done
outside of the function, which could be missing by mistake.

And since the internal using of trace_test_and_set_recursion()
and trace_clear_recursion() also require preemption disabled, we
can just merge the logical.

This patch will make sure the preemption has been disabled when
trace_test_and_set_recursion() return bit >= 0, and
trace_clear_recursion() will enable the preemption if previously
enabled.

Link: https://lkml.kernel.org/r/13bde807-779c-aa4c-0672-20515ae365ea@linux.alibaba.com

CC: Petr Mladek <pmladek@suse.com>
Cc: Guo Ren <guoren@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com>
Cc: Helge Deller <deller@gmx.de>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Joe Lawrence <joe.lawrence@redhat.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Jisheng Zhang <jszhang@kernel.org>
CC: Steven Rostedt <rostedt@goodmis.org>
CC: Miroslav Benes <mbenes@suse.cz>
Reported-by: Abaci <abaci@linux.alibaba.com>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Michael Wang <yun.wang@linux.alibaba.com>
[ Removed extra line in comment - SDR ]
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/trace_recursion.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h
index 24f284eb55a7..a13f23b04d73 100644
--- a/include/linux/trace_recursion.h
+++ b/include/linux/trace_recursion.h
@@ -155,6 +155,9 @@ extern void ftrace_record_recursion(unsigned long ip, unsigned long parent_ip);
 # define do_ftrace_record_recursion(ip, pip)	do { } while (0)
 #endif
 
+/*
+ * Preemption is promised to be disabled when return bit >= 0.
+ */
 static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsigned long pip,
 							int start, int max)
 {
@@ -189,14 +192,20 @@ static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsign
 	current->trace_recursion = val;
 	barrier();
 
+	preempt_disable_notrace();
+
 	return bit + 1;
 }
 
+/*
+ * Preemption will be enabled (if it was previously enabled).
+ */
 static __always_inline void trace_clear_recursion(int bit)
 {
 	if (!bit)
 		return;
 
+	preempt_enable_notrace();
 	barrier();
 	bit--;
 	trace_recursion_clear(bit);
@@ -209,7 +218,7 @@ static __always_inline void trace_clear_recursion(int bit)
  * tracing recursed in the same context (normal vs interrupt),
  *
  * Returns: -1 if a recursion happened.
- *           >= 0 if no recursion
+ *           >= 0 if no recursion.
  */
 static __always_inline int ftrace_test_recursion_trylock(unsigned long ip,
 							 unsigned long parent_ip)
-- 
cgit v1.2.3


From e531e90b5ab0f7ce5ff298e165214c1aec6ed187 Mon Sep 17 00:00:00 2001
From: "Robin H. Johnson" <robbat2@gentoo.org>
Date: Mon, 30 Aug 2021 21:37:23 -0700
Subject: tracing: Increase PERF_MAX_TRACE_SIZE to handle Sentinel1 and docker
 together

Running endpoint security solutions like Sentinel1 that use perf-based
tracing heavily lead to this repeated dump complaining about dockerd.
The default value of 2048 is nowhere near not large enough.

Using the prior patch "tracing: show size of requested buffer", we get
"perf buffer not large enough, wanted 6644, have 6144", after repeated
up-sizing (I did 2/4/6/8K). With 8K, the problem doesn't occur at all,
so below is the trace for 6K.

I'm wondering if this value should be selectable at boot time, but this
is a good starting point.

```
------------[ cut here ]------------
perf buffer not large enough, wanted 6644, have 6144
WARNING: CPU: 1 PID: 4997 at kernel/trace/trace_event_perf.c:402 perf_trace_buf_alloc+0x8c/0xa0
Modules linked in: [..]
CPU: 1 PID: 4997 Comm: sh Tainted: G                T 5.13.13-x86_64-00039-gb3959163488e #63
Hardware name: LENOVO 20KH002JUS/20KH002JUS, BIOS N23ET66W (1.41 ) 09/02/2019
RIP: 0010:perf_trace_buf_alloc+0x8c/0xa0
Code: 80 3d 43 97 d0 01 00 74 07 31 c0 5b 5d 41 5c c3 ba 00 18 00 00 89 ee 48 c7 c7 00 82 7d 91 c6 05 25 97 d0 01 01 e8 22 ee bc 00 <0f> 0b 31 c0 eb db 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 55 89
RSP: 0018:ffffb922026b7d58 EFLAGS: 00010282
RAX: 0000000000000000 RBX: ffff9da5ee012000 RCX: 0000000000000027
RDX: ffff9da881657828 RSI: 0000000000000001 RDI: ffff9da881657820
RBP: 00000000000019f4 R08: 0000000000000000 R09: ffffb922026b7b80
R10: ffffb922026b7b78 R11: ffffffff91dda688 R12: 000000000000000f
R13: ffff9da5ee012108 R14: ffff9da8816570a0 R15: ffffb922026b7e30
FS:  00007f420db1a080(0000) GS:ffff9da881640000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000060 CR3: 00000002504a8006 CR4: 00000000003706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 kprobe_perf_func+0x11e/0x270
 ? do_execveat_common.isra.0+0x1/0x1c0
 ? do_execveat_common.isra.0+0x5/0x1c0
 kprobe_ftrace_handler+0x10e/0x1d0
 0xffffffffc03aa0c8
 ? do_execveat_common.isra.0+0x1/0x1c0
 do_execveat_common.isra.0+0x5/0x1c0
 __x64_sys_execve+0x33/0x40
 do_syscall_64+0x6b/0xc0
 ? do_syscall_64+0x11/0xc0
 entry_SYSCALL_64_after_hwframe+0x44/0xae
RIP: 0033:0x7f420dc1db37
Code: ff ff 76 e7 f7 d8 64 41 89 00 eb df 0f 1f 80 00 00 00 00 f7 d8 64 41 89 00 eb dc 0f 1f 84 00 00 00 00 00 b8 3b 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 01 43 0f 00 f7 d8 64 89 01 48
RSP: 002b:00007ffd4e8b4e38 EFLAGS: 00000246 ORIG_RAX: 000000000000003b
RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f420dc1db37
RDX: 0000564338d1e740 RSI: 0000564338d32d50 RDI: 0000564338d28f00
RBP: 0000564338d28f00 R08: 0000564338d32d50 R09: 0000000000000020
R10: 00000000000001b6 R11: 0000000000000246 R12: 0000564338d28f00
R13: 0000564338d32d50 R14: 0000564338d1e740 R15: 0000564338d28c60
---[ end trace 83ab3e8e16275e49 ]---
```

Link: https://lkml.kernel.org/r/20210831043723.13481-2-robbat2@gentoo.org

Signed-off-by: Robin H. Johnson <robbat2@gentoo.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/trace_events.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 3e475eeb5a99..50453b287615 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -671,7 +671,7 @@ struct trace_event_file {
 	}								\
 	early_initcall(trace_init_perf_perm_##name);
 
-#define PERF_MAX_TRACE_SIZE	2048
+#define PERF_MAX_TRACE_SIZE	8192
 
 #define MAX_FILTER_STR_VAL	256	/* Should handle KSYM_SYMBOL_LEN */
 
-- 
cgit v1.2.3


From f941eadd8d6d4ee2f8c9aeab8e1da5e647533a7d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 26 Oct 2021 14:41:31 -0700
Subject: bpf: Avoid races in __bpf_prog_run() for 32bit arches

__bpf_prog_run() can run from non IRQ contexts, meaning
it could be re entered if interrupted.

This calls for the irq safe variant of u64_stats_update_{begin|end},
or risk a deadlock.

This patch is a nop on 64bit arches, fortunately.

syzbot report:

WARNING: inconsistent lock state
5.12.0-rc3-syzkaller #0 Not tainted
--------------------------------
inconsistent {IN-SOFTIRQ-W} -> {SOFTIRQ-ON-W} usage.
udevd/4013 [HC0[0]:SC0[0]:HE1:SE1] takes:
ff7c9dec (&(&pstats->syncp)->seq){+.?.}-{0:0}, at: sk_filter include/linux/filter.h:867 [inline]
ff7c9dec (&(&pstats->syncp)->seq){+.?.}-{0:0}, at: do_one_broadcast net/netlink/af_netlink.c:1468 [inline]
ff7c9dec (&(&pstats->syncp)->seq){+.?.}-{0:0}, at: netlink_broadcast_filtered+0x27c/0x4fc net/netlink/af_netlink.c:1520
{IN-SOFTIRQ-W} state was registered at:
  lock_acquire.part.0+0xf0/0x41c kernel/locking/lockdep.c:5510
  lock_acquire+0x6c/0x74 kernel/locking/lockdep.c:5483
  do_write_seqcount_begin_nested include/linux/seqlock.h:520 [inline]
  do_write_seqcount_begin include/linux/seqlock.h:545 [inline]
  u64_stats_update_begin include/linux/u64_stats_sync.h:129 [inline]
  bpf_prog_run_pin_on_cpu include/linux/filter.h:624 [inline]
  bpf_prog_run_clear_cb+0x1bc/0x270 include/linux/filter.h:755
  run_filter+0xa0/0x17c net/packet/af_packet.c:2031
  packet_rcv+0xc0/0x3e0 net/packet/af_packet.c:2104
  dev_queue_xmit_nit+0x2bc/0x39c net/core/dev.c:2387
  xmit_one net/core/dev.c:3588 [inline]
  dev_hard_start_xmit+0x94/0x518 net/core/dev.c:3609
  sch_direct_xmit+0x11c/0x1f0 net/sched/sch_generic.c:313
  qdisc_restart net/sched/sch_generic.c:376 [inline]
  __qdisc_run+0x194/0x7f8 net/sched/sch_generic.c:384
  qdisc_run include/net/pkt_sched.h:136 [inline]
  qdisc_run include/net/pkt_sched.h:128 [inline]
  __dev_xmit_skb net/core/dev.c:3795 [inline]
  __dev_queue_xmit+0x65c/0xf84 net/core/dev.c:4150
  dev_queue_xmit+0x14/0x18 net/core/dev.c:4215
  neigh_resolve_output net/core/neighbour.c:1491 [inline]
  neigh_resolve_output+0x170/0x228 net/core/neighbour.c:1471
  neigh_output include/net/neighbour.h:510 [inline]
  ip6_finish_output2+0x2e4/0x9fc net/ipv6/ip6_output.c:117
  __ip6_finish_output net/ipv6/ip6_output.c:182 [inline]
  __ip6_finish_output+0x164/0x3f8 net/ipv6/ip6_output.c:161
  ip6_finish_output+0x2c/0xb0 net/ipv6/ip6_output.c:192
  NF_HOOK_COND include/linux/netfilter.h:290 [inline]
  ip6_output+0x74/0x294 net/ipv6/ip6_output.c:215
  dst_output include/net/dst.h:448 [inline]
  NF_HOOK include/linux/netfilter.h:301 [inline]
  NF_HOOK include/linux/netfilter.h:295 [inline]
  mld_sendpack+0x2a8/0x7e4 net/ipv6/mcast.c:1679
  mld_send_cr net/ipv6/mcast.c:1975 [inline]
  mld_ifc_timer_expire+0x1e8/0x494 net/ipv6/mcast.c:2474
  call_timer_fn+0xd0/0x570 kernel/time/timer.c:1431
  expire_timers kernel/time/timer.c:1476 [inline]
  __run_timers kernel/time/timer.c:1745 [inline]
  run_timer_softirq+0x2e4/0x384 kernel/time/timer.c:1758
  __do_softirq+0x204/0x7ac kernel/softirq.c:345
  do_softirq_own_stack include/asm-generic/softirq_stack.h:10 [inline]
  invoke_softirq kernel/softirq.c:228 [inline]
  __irq_exit_rcu+0x1d8/0x200 kernel/softirq.c:422
  irq_exit+0x10/0x3c kernel/softirq.c:446
  __handle_domain_irq+0xb4/0x120 kernel/irq/irqdesc.c:692
  handle_domain_irq include/linux/irqdesc.h:176 [inline]
  gic_handle_irq+0x84/0xac drivers/irqchip/irq-gic.c:370
  __irq_svc+0x5c/0x94 arch/arm/kernel/entry-armv.S:205
  debug_smp_processor_id+0x0/0x24 lib/smp_processor_id.c:53
  rcu_read_lock_held_common kernel/rcu/update.c:108 [inline]
  rcu_read_lock_sched_held+0x24/0x7c kernel/rcu/update.c:123
  trace_lock_acquire+0x24c/0x278 include/trace/events/lock.h:13
  lock_acquire+0x3c/0x74 kernel/locking/lockdep.c:5481
  rcu_lock_acquire include/linux/rcupdate.h:267 [inline]
  rcu_read_lock include/linux/rcupdate.h:656 [inline]
  avc_has_perm_noaudit+0x6c/0x260 security/selinux/avc.c:1150
  selinux_inode_permission+0x140/0x220 security/selinux/hooks.c:3141
  security_inode_permission+0x44/0x60 security/security.c:1268
  inode_permission.part.0+0x5c/0x13c fs/namei.c:521
  inode_permission fs/namei.c:494 [inline]
  may_lookup fs/namei.c:1652 [inline]
  link_path_walk.part.0+0xd4/0x38c fs/namei.c:2208
  link_path_walk fs/namei.c:2189 [inline]
  path_lookupat+0x3c/0x1b8 fs/namei.c:2419
  filename_lookup+0xa8/0x1a4 fs/namei.c:2453
  user_path_at_empty+0x74/0x90 fs/namei.c:2733
  do_readlinkat+0x5c/0x12c fs/stat.c:417
  __do_sys_readlink fs/stat.c:450 [inline]
  sys_readlink+0x24/0x28 fs/stat.c:447
  ret_fast_syscall+0x0/0x2c arch/arm/mm/proc-v7.S:64
  0x7eaa4974
irq event stamp: 298277
hardirqs last  enabled at (298277): [<802000d0>] no_work_pending+0x4/0x34
hardirqs last disabled at (298276): [<8020c9b8>] do_work_pending+0x9c/0x648 arch/arm/kernel/signal.c:676
softirqs last  enabled at (298216): [<8020167c>] __do_softirq+0x584/0x7ac kernel/softirq.c:372
softirqs last disabled at (298201): [<8024dff4>] do_softirq_own_stack include/asm-generic/softirq_stack.h:10 [inline]
softirqs last disabled at (298201): [<8024dff4>] invoke_softirq kernel/softirq.c:228 [inline]
softirqs last disabled at (298201): [<8024dff4>] __irq_exit_rcu+0x1d8/0x200 kernel/softirq.c:422

other info that might help us debug this:
 Possible unsafe locking scenario:

       CPU0
       ----
  lock(&(&pstats->syncp)->seq);
  <Interrupt>
    lock(&(&pstats->syncp)->seq);

 *** DEADLOCK ***

1 lock held by udevd/4013:
 #0: 82b09c5c (rcu_read_lock){....}-{1:2}, at: sk_filter_trim_cap+0x54/0x434 net/core/filter.c:139

stack backtrace:
CPU: 1 PID: 4013 Comm: udevd Not tainted 5.12.0-rc3-syzkaller #0
Hardware name: ARM-Versatile Express
Backtrace:
[<81802550>] (dump_backtrace) from [<818027c4>] (show_stack+0x18/0x1c arch/arm/kernel/traps.c:252)
 r7:00000080 r6:600d0093 r5:00000000 r4:82b58344
[<818027ac>] (show_stack) from [<81809e98>] (__dump_stack lib/dump_stack.c:79 [inline])
[<818027ac>] (show_stack) from [<81809e98>] (dump_stack+0xb8/0xe8 lib/dump_stack.c:120)
[<81809de0>] (dump_stack) from [<81804a00>] (print_usage_bug.part.0+0x228/0x230 kernel/locking/lockdep.c:3806)
 r7:86bcb768 r6:81a0326c r5:830f96a8 r4:86bcb0c0
[<818047d8>] (print_usage_bug.part.0) from [<802bb1b8>] (print_usage_bug kernel/locking/lockdep.c:3776 [inline])
[<818047d8>] (print_usage_bug.part.0) from [<802bb1b8>] (valid_state kernel/locking/lockdep.c:3818 [inline])
[<818047d8>] (print_usage_bug.part.0) from [<802bb1b8>] (mark_lock_irq kernel/locking/lockdep.c:4021 [inline])
[<818047d8>] (print_usage_bug.part.0) from [<802bb1b8>] (mark_lock.part.0+0xc34/0x136c kernel/locking/lockdep.c:4478)
 r10:83278fe8 r9:82c6d748 r8:00000000 r7:82c6d2d4 r6:00000004 r5:86bcb768
 r4:00000006
[<802ba584>] (mark_lock.part.0) from [<802bc644>] (mark_lock kernel/locking/lockdep.c:4442 [inline])
[<802ba584>] (mark_lock.part.0) from [<802bc644>] (mark_usage kernel/locking/lockdep.c:4391 [inline])
[<802ba584>] (mark_lock.part.0) from [<802bc644>] (__lock_acquire+0x9bc/0x3318 kernel/locking/lockdep.c:4854)
 r10:86bcb768 r9:86bcb0c0 r8:00000001 r7:00040000 r6:0000075a r5:830f96a8
 r4:00000000
[<802bbc88>] (__lock_acquire) from [<802bfb90>] (lock_acquire.part.0+0xf0/0x41c kernel/locking/lockdep.c:5510)
 r10:00000000 r9:600d0013 r8:00000000 r7:00000000 r6:828a2680 r5:828a2680
 r4:861e5bc8
[<802bfaa0>] (lock_acquire.part.0) from [<802bff28>] (lock_acquire+0x6c/0x74 kernel/locking/lockdep.c:5483)
 r10:8146137c r9:00000000 r8:00000001 r7:00000000 r6:00000000 r5:00000000
 r4:ff7c9dec
[<802bfebc>] (lock_acquire) from [<81381eb4>] (do_write_seqcount_begin_nested include/linux/seqlock.h:520 [inline])
[<802bfebc>] (lock_acquire) from [<81381eb4>] (do_write_seqcount_begin include/linux/seqlock.h:545 [inline])
[<802bfebc>] (lock_acquire) from [<81381eb4>] (u64_stats_update_begin include/linux/u64_stats_sync.h:129 [inline])
[<802bfebc>] (lock_acquire) from [<81381eb4>] (__bpf_prog_run_save_cb include/linux/filter.h:727 [inline])
[<802bfebc>] (lock_acquire) from [<81381eb4>] (bpf_prog_run_save_cb include/linux/filter.h:741 [inline])
[<802bfebc>] (lock_acquire) from [<81381eb4>] (sk_filter_trim_cap+0x26c/0x434 net/core/filter.c:149)
 r10:a4095dd0 r9:ff7c9dd0 r8:e44be000 r7:8146137c r6:00000001 r5:8611ba80
 r4:00000000
[<81381c48>] (sk_filter_trim_cap) from [<8146137c>] (sk_filter include/linux/filter.h:867 [inline])
[<81381c48>] (sk_filter_trim_cap) from [<8146137c>] (do_one_broadcast net/netlink/af_netlink.c:1468 [inline])
[<81381c48>] (sk_filter_trim_cap) from [<8146137c>] (netlink_broadcast_filtered+0x27c/0x4fc net/netlink/af_netlink.c:1520)
 r10:00000001 r9:833d6b1c r8:00000000 r7:8572f864 r6:8611ba80 r5:8698d800
 r4:8572f800
[<81461100>] (netlink_broadcast_filtered) from [<81463e60>] (netlink_broadcast net/netlink/af_netlink.c:1544 [inline])
[<81461100>] (netlink_broadcast_filtered) from [<81463e60>] (netlink_sendmsg+0x3d0/0x478 net/netlink/af_netlink.c:1925)
 r10:00000000 r9:00000002 r8:8698d800 r7:000000b7 r6:8611b900 r5:861e5f50
 r4:86aa3000
[<81463a90>] (netlink_sendmsg) from [<81321f54>] (sock_sendmsg_nosec net/socket.c:654 [inline])
[<81463a90>] (netlink_sendmsg) from [<81321f54>] (sock_sendmsg+0x3c/0x4c net/socket.c:674)
 r10:00000000 r9:861e5dd4 r8:00000000 r7:86570000 r6:00000000 r5:86570000
 r4:861e5f50
[<81321f18>] (sock_sendmsg) from [<813234d0>] (____sys_sendmsg+0x230/0x29c net/socket.c:2350)
 r5:00000040 r4:861e5f50
[<813232a0>] (____sys_sendmsg) from [<8132549c>] (___sys_sendmsg+0xac/0xe4 net/socket.c:2404)
 r10:00000128 r9:861e4000 r8:00000000 r7:00000000 r6:86570000 r5:861e5f50
 r4:00000000
[<813253f0>] (___sys_sendmsg) from [<81325684>] (__sys_sendmsg net/socket.c:2433 [inline])
[<813253f0>] (___sys_sendmsg) from [<81325684>] (__do_sys_sendmsg net/socket.c:2442 [inline])
[<813253f0>] (___sys_sendmsg) from [<81325684>] (sys_sendmsg+0x58/0xa0 net/socket.c:2440)
 r8:80200224 r7:00000128 r6:00000000 r5:7eaa541c r4:86570000
[<8132562c>] (sys_sendmsg) from [<80200060>] (ret_fast_syscall+0x0/0x2c arch/arm/mm/proc-v7.S:64)
Exception stack(0x861e5fa8 to 0x861e5ff0)
5fa0:                   00000000 00000000 0000000c 7eaa541c 00000000 00000000
5fc0: 00000000 00000000 76fbf840 00000128 00000000 0000008f 7eaa541c 000563f8
5fe0: 00056110 7eaa53e0 00036cec 76c9bf44
 r6:76fbf840 r5:00000000 r4:00000000

Fixes: 492ecee892c2 ("bpf: enable program stats")
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211026214133.3114279-2-eric.dumazet@gmail.com
---
 include/linux/filter.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 47f80adbe744..2fffe9cc50f9 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -612,13 +612,14 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
 	if (static_branch_unlikely(&bpf_stats_enabled_key)) {
 		struct bpf_prog_stats *stats;
 		u64 start = sched_clock();
+		unsigned long flags;
 
 		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
 		stats = this_cpu_ptr(prog->stats);
-		u64_stats_update_begin(&stats->syncp);
+		flags = u64_stats_update_begin_irqsave(&stats->syncp);
 		stats->cnt++;
 		stats->nsecs += sched_clock() - start;
-		u64_stats_update_end(&stats->syncp);
+		u64_stats_update_end_irqrestore(&stats->syncp, flags);
 	} else {
 		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
 	}
-- 
cgit v1.2.3


From 61a0abaee2092eee69e44fe60336aa2f5b578938 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 26 Oct 2021 14:41:33 -0700
Subject: bpf: Use u64_stats_t in struct bpf_prog_stats

Commit 316580b69d0a ("u64_stats: provide u64_stats_t type")
fixed possible load/store tearing on 64bit arches.

For instance the following C code

stats->nsecs += sched_clock() - start;

Could be rightfully implemented like this by a compiler,
confusing concurrent readers a lot:

stats->nsecs += sched_clock();
// arbitrary delay
stats->nsecs -= start;

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211026214133.3114279-4-eric.dumazet@gmail.com
---
 include/linux/filter.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 2fffe9cc50f9..9782e3245852 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -553,9 +553,9 @@ struct bpf_binary_header {
 };
 
 struct bpf_prog_stats {
-	u64 cnt;
-	u64 nsecs;
-	u64 misses;
+	u64_stats_t cnt;
+	u64_stats_t nsecs;
+	u64_stats_t misses;
 	struct u64_stats_sync syncp;
 } __aligned(2 * sizeof(u64));
 
@@ -617,8 +617,8 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
 		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
 		stats = this_cpu_ptr(prog->stats);
 		flags = u64_stats_update_begin_irqsave(&stats->syncp);
-		stats->cnt++;
-		stats->nsecs += sched_clock() - start;
+		u64_stats_inc(&stats->cnt);
+		u64_stats_add(&stats->nsecs, sched_clock() - start);
 		u64_stats_update_end_irqrestore(&stats->syncp, flags);
 	} else {
 		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
-- 
cgit v1.2.3


From 259714100d98b50bf04d36a21bf50ca8b829fc11 Mon Sep 17 00:00:00 2001
From: Chunfeng Yun <chunfeng.yun@mediatek.com>
Date: Mon, 25 Oct 2021 15:01:53 +0800
Subject: PM / wakeirq: support enabling wake-up irq after runtime_suspend
 called

When the dedicated wake IRQ is level trigger, and it uses the
device's low-power status as the wakeup source, that means if the
device is not in low-power state, the wake IRQ will be triggered
if enabled; For this case, need enable the wake IRQ after running
the device's ->runtime_suspend() which make it enter low-power state.

e.g.
Assume the wake IRQ is a low level trigger type, and the wakeup
signal comes from the low-power status of the device.
The wakeup signal is low level at running time (0), and becomes
high level when the device enters low-power state (runtime_suspend
(1) is called), a wakeup event at (2) make the device exit low-power
state, then the wakeup signal also becomes low level.

                ------------------
               |           ^     ^|
----------------           |     | --------------
 |<---(0)--->|<--(1)--|   (3)   (2)    (4)

if enable the wake IRQ before running runtime_suspend during (0),
a wake IRQ will arise, it causes resume immediately;
it works if enable wake IRQ ( e.g. at (3) or (4)) after running
->runtime_suspend().

This patch introduces a new status WAKE_IRQ_DEDICATED_REVERSE to
optionally support enabling wake IRQ after running ->runtime_suspend().

Suggested-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Chunfeng Yun <chunfeng.yun@mediatek.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/pm_wakeirq.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pm_wakeirq.h b/include/linux/pm_wakeirq.h
index cd5b62db9084..e63a63aa47a3 100644
--- a/include/linux/pm_wakeirq.h
+++ b/include/linux/pm_wakeirq.h
@@ -17,8 +17,8 @@
 #ifdef CONFIG_PM
 
 extern int dev_pm_set_wake_irq(struct device *dev, int irq);
-extern int dev_pm_set_dedicated_wake_irq(struct device *dev,
-					 int irq);
+extern int dev_pm_set_dedicated_wake_irq(struct device *dev, int irq);
+extern int dev_pm_set_dedicated_wake_irq_reverse(struct device *dev, int irq);
 extern void dev_pm_clear_wake_irq(struct device *dev);
 extern void dev_pm_enable_wake_irq(struct device *dev);
 extern void dev_pm_disable_wake_irq(struct device *dev);
@@ -35,6 +35,11 @@ static inline int dev_pm_set_dedicated_wake_irq(struct device *dev, int irq)
 	return 0;
 }
 
+static inline int dev_pm_set_dedicated_wake_irq_reverse(struct device *dev, int irq)
+{
+	return 0;
+}
+
 static inline void dev_pm_clear_wake_irq(struct device *dev)
 {
 }
-- 
cgit v1.2.3


From 570b1cac477643cbf01a45fa5d018430a1fddbce Mon Sep 17 00:00:00 2001
From: Xie Yongji <xieyongji@bytedance.com>
Date: Tue, 26 Oct 2021 22:40:12 +0800
Subject: block: Add a helper to validate the block size

There are some duplicated codes to validate the block
size in block drivers. This limitation actually comes
from block layer, so this patch tries to add a new block
layer helper for that.

Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
Link: https://lore.kernel.org/r/20211026144015.188-2-xieyongji@bytedance.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6d95a4b36cfa..d2d627e2c782 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -44,6 +44,14 @@ struct blk_crypto_profile;
  */
 #define BLKCG_MAX_POLS		6
 
+static inline int blk_validate_block_size(unsigned int bsize)
+{
+	if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize))
+		return -EINVAL;
+
+	return 0;
+}
+
 static inline bool blk_op_is_passthrough(unsigned int op)
 {
 	op &= REQ_OP_MASK;
-- 
cgit v1.2.3


From 9dfc685e0262d4c5e44e13302f89841fa75173ca Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 26 Oct 2021 14:30:14 -0700
Subject: inet: remove races in inet{6}_getname()

syzbot reported data-races in inet_getname() multiple times,
it is time we fix this instead of pretending applications
should not trigger them.

getsockname() and getpeername() are not really considered fast path.

v2: added the missing BPF_CGROUP_RUN_SA_PROG() declaration
    needed when CONFIG_CGROUP_BPF=n, as reported by
    kernel test robot <lkp@intel.com>

syzbot typical report:

BUG: KCSAN: data-race in __inet_hash_connect / inet_getname

write to 0xffff888136d66cf8 of 2 bytes by task 14374 on cpu 1:
 __inet_hash_connect+0x7ec/0x950 net/ipv4/inet_hashtables.c:831
 inet_hash_connect+0x85/0x90 net/ipv4/inet_hashtables.c:853
 tcp_v4_connect+0x782/0xbb0 net/ipv4/tcp_ipv4.c:275
 __inet_stream_connect+0x156/0x6e0 net/ipv4/af_inet.c:664
 inet_stream_connect+0x44/0x70 net/ipv4/af_inet.c:728
 __sys_connect_file net/socket.c:1896 [inline]
 __sys_connect+0x254/0x290 net/socket.c:1913
 __do_sys_connect net/socket.c:1923 [inline]
 __se_sys_connect net/socket.c:1920 [inline]
 __x64_sys_connect+0x3d/0x50 net/socket.c:1920
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x44/0xa0 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x44/0xae

read to 0xffff888136d66cf8 of 2 bytes by task 14408 on cpu 0:
 inet_getname+0x11f/0x170 net/ipv4/af_inet.c:790
 __sys_getsockname+0x11d/0x1b0 net/socket.c:1946
 __do_sys_getsockname net/socket.c:1961 [inline]
 __se_sys_getsockname net/socket.c:1958 [inline]
 __x64_sys_getsockname+0x3e/0x50 net/socket.c:1958
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x44/0xa0 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x44/0xae

value changed: 0x0000 -> 0xdee0

Reported by Kernel Concurrency Sanitizer on:
CPU: 0 PID: 14408 Comm: syz-executor.3 Not tainted 5.15.0-rc3-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Link: https://lore.kernel.org/r/20211026213014.3026708-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/bpf-cgroup.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 2746fd804216..3536ab432b30 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -517,6 +517,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
 
 #define cgroup_bpf_enabled(atype) (0)
 #define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx) ({ 0; })
+#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, atype) ({ 0; })
 #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
-- 
cgit v1.2.3


From d4dccf353db80e209f262e3973c834e6e48ba9a9 Mon Sep 17 00:00:00 2001
From: Tianyu Lan <Tianyu.Lan@microsoft.com>
Date: Mon, 25 Oct 2021 08:21:09 -0400
Subject: Drivers: hv: vmbus: Mark vmbus ring buffer visible to host in
 Isolation VM

Mark vmbus ring buffer visible with set_memory_decrypted() when
establish gpadl handle.

Reviewed-by: Michael Kelley <mikelley@microsoft.com>
Signed-off-by: Tianyu Lan <Tianyu.Lan@microsoft.com>
Link: https://lore.kernel.org/r/20211025122116.264793-5-ltykernel@gmail.com
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 include/linux/hyperv.h | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index ddc8713ce57b..a9e0bc3b1511 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -803,6 +803,12 @@ struct vmbus_device {
 
 #define VMBUS_DEFAULT_MAX_PKT_SIZE 4096
 
+struct vmbus_gpadl {
+	u32 gpadl_handle;
+	u32 size;
+	void *buffer;
+};
+
 struct vmbus_channel {
 	struct list_head listentry;
 
@@ -822,7 +828,7 @@ struct vmbus_channel {
 	bool rescind_ref; /* got rescind msg, got channel reference */
 	struct completion rescind_event;
 
-	u32 ringbuffer_gpadlhandle;
+	struct vmbus_gpadl ringbuffer_gpadlhandle;
 
 	/* Allocated memory for ring buffer */
 	struct page *ringbuffer_page;
@@ -1192,10 +1198,10 @@ extern int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel,
 extern int vmbus_establish_gpadl(struct vmbus_channel *channel,
 				      void *kbuffer,
 				      u32 size,
-				      u32 *gpadl_handle);
+				      struct vmbus_gpadl *gpadl);
 
 extern int vmbus_teardown_gpadl(struct vmbus_channel *channel,
-				     u32 gpadl_handle);
+				     struct vmbus_gpadl *gpadl);
 
 void vmbus_reset_channel_cb(struct vmbus_channel *channel);
 
-- 
cgit v1.2.3


From 20cf6616ccd50256a14fb2a7a3cc730080c90cd0 Mon Sep 17 00:00:00 2001
From: Michael Kelley <mikelley@microsoft.com>
Date: Mon, 25 Oct 2021 12:54:34 -0700
Subject: Drivers: hv: vmbus: Remove unused code to check for subchannels

The last caller of vmbus_are_subchannels_present() was removed in commit
c967590457ca ("scsi: storvsc: Fix a race in sub-channel creation that can cause panic").

Remove this dead code, and the utility function invoke_sc_cb() that it is
the only caller of.

Signed-off-by: Michael Kelley <mikelley@microsoft.com>
Link: https://lore.kernel.org/r/1635191674-34407-1-git-send-email-mikelley@microsoft.com
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 include/linux/hyperv.h | 13 -------------
 1 file changed, 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index a9e0bc3b1511..b823311eac79 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1106,19 +1106,6 @@ void vmbus_set_sc_create_callback(struct vmbus_channel *primary_channel,
 void vmbus_set_chn_rescind_callback(struct vmbus_channel *channel,
 		void (*chn_rescind_cb)(struct vmbus_channel *));
 
-/*
- * Check if sub-channels have already been offerred. This API will be useful
- * when the driver is unloaded after establishing sub-channels. In this case,
- * when the driver is re-loaded, the driver would have to check if the
- * subchannels have already been established before attempting to request
- * the creation of sub-channels.
- * This function returns TRUE to indicate that subchannels have already been
- * created.
- * This function should be invoked after setting the callback function for
- * sub-channel creation.
- */
-bool vmbus_are_subchannels_present(struct vmbus_channel *primary);
-
 /* The format must be the same as struct vmdata_gpa_direct */
 struct vmbus_channel_packet_page_buffer {
 	u16 type;
-- 
cgit v1.2.3


From 9330986c03006ab1d33d243b7cfe598a7a3c1baa Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannekoong@fb.com>
Date: Wed, 27 Oct 2021 16:45:00 -0700
Subject: bpf: Add bloom filter map implementation

This patch adds the kernel-side changes for the implementation of
a bpf bloom filter map.

The bloom filter map supports peek (determining whether an element
is present in the map) and push (adding an element to the map)
operations.These operations are exposed to userspace applications
through the already existing syscalls in the following way:

BPF_MAP_LOOKUP_ELEM -> peek
BPF_MAP_UPDATE_ELEM -> push

The bloom filter map does not have keys, only values. In light of
this, the bloom filter map's API matches that of queue stack maps:
user applications use BPF_MAP_LOOKUP_ELEM/BPF_MAP_UPDATE_ELEM
which correspond internally to bpf_map_peek_elem/bpf_map_push_elem,
and bpf programs must use the bpf_map_peek_elem and bpf_map_push_elem
APIs to query or add an element to the bloom filter map. When the
bloom filter map is created, it must be created with a key_size of 0.

For updates, the user will pass in the element to add to the map
as the value, with a NULL key. For lookups, the user will pass in the
element to query in the map as the value, with a NULL key. In the
verifier layer, this requires us to modify the argument type of
a bloom filter's BPF_FUNC_map_peek_elem call to ARG_PTR_TO_MAP_VALUE;
as well, in the syscall layer, we need to copy over the user value
so that in bpf_map_peek_elem, we know which specific value to query.

A few things to please take note of:
 * If there are any concurrent lookups + updates, the user is
responsible for synchronizing this to ensure no false negative lookups
occur.
 * The number of hashes to use for the bloom filter is configurable from
userspace. If no number is specified, the default used will be 5 hash
functions. The benchmarks later in this patchset can help compare the
performance of using different number of hashes on different entry
sizes. In general, using more hashes decreases both the false positive
rate and the speed of a lookup.
 * Deleting an element in the bloom filter map is not supported.
 * The bloom filter map may be used as an inner map.
 * The "max_entries" size that is specified at map creation time is used
to approximate a reasonable bitmap size for the bloom filter, and is not
otherwise strictly enforced. If the user wishes to insert more entries
into the bloom filter than "max_entries", they may do so but they should
be aware that this may lead to a higher false positive rate.

Signed-off-by: Joanne Koong <joannekoong@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211027234504.30744-2-joannekoong@fb.com
---
 include/linux/bpf.h       | 1 +
 include/linux/bpf_types.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 31421c74ba08..50105e0b8fcc 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -169,6 +169,7 @@ struct bpf_map {
 	u32 value_size;
 	u32 max_entries;
 	u32 map_flags;
+	u64 map_extra; /* any per-map-type extra fields */
 	int spin_lock_off; /* >=0 valid offset, <0 error */
 	int timer_off; /* >=0 valid offset, <0 error */
 	u32 id;
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 9c81724e4b98..c4424ac2fa02 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -125,6 +125,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STACK, stack_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops)
 #endif
 BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops)
 
 BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint)
 BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing)
-- 
cgit v1.2.3


From d6aef08a872b9e23eecc92d0e92393473b13c497 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Thu, 28 Oct 2021 12:04:54 +0530
Subject: bpf: Add bpf_kallsyms_lookup_name helper

This helper allows us to get the address of a kernel symbol from inside
a BPF_PROG_TYPE_SYSCALL prog (used by gen_loader), so that we can
relocate typeless ksym vars.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20211028063501.2239335-2-memxor@gmail.com
---
 include/linux/bpf.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 50105e0b8fcc..6deebf8bf78f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2110,6 +2110,7 @@ extern const struct bpf_func_proto bpf_for_each_map_elem_proto;
 extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto;
 extern const struct bpf_func_proto bpf_sk_setsockopt_proto;
 extern const struct bpf_func_proto bpf_sk_getsockopt_proto;
+extern const struct bpf_func_proto bpf_kallsyms_lookup_name_proto;
 
 const struct bpf_func_proto *tracing_prog_func_proto(
   enum bpf_func_id func_id, const struct bpf_prog *prog);
-- 
cgit v1.2.3


From 78476d315e190533757ab894255c4f2c2f254bce Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@codeconstruct.com.au>
Date: Fri, 29 Oct 2021 11:01:44 +0800
Subject: mctp: Add flow extension to skb

This change adds a new skb extension for MCTP, to represent a
request/response flow.

The intention is to use this in a later change to allow i2c controllers
to correctly configure a multiplexer over a flow.

Since we have a cleanup function in the core path (if an extension is
present), we'll need to make CONFIG_MCTP a bool, rather than a tristate.

Includes a fix for a build warning with clang:
Reported-by: kernel test robot <lkp@intel.com>

Signed-off-by: Jeremy Kerr <jk@codeconstruct.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index cb96f1e6460c..0bd6520329f6 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4243,6 +4243,9 @@ enum skb_ext_id {
 #endif
 #if IS_ENABLED(CONFIG_MPTCP)
 	SKB_EXT_MPTCP,
+#endif
+#if IS_ENABLED(CONFIG_MCTP_FLOWS)
+	SKB_EXT_MCTP,
 #endif
 	SKB_EXT_NUM, /* must be last */
 };
-- 
cgit v1.2.3


From 0bf6d96cb8294094ce1e44cbe8cf65b0899d0a3a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 25 Oct 2021 09:05:07 +0200
Subject: block: remove blk_{get,put}_request

These are now pointless wrappers around blk_mq_{alloc,free}_request,
so remove them.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Link: https://lore.kernel.org/r/20211025070517.1548584-3-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index ebc45cf0450b..8682663e7368 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -892,9 +892,6 @@ static inline bool rq_is_sync(struct request *rq)
 }
 
 void blk_rq_init(struct request_queue *q, struct request *rq);
-void blk_put_request(struct request *rq);
-struct request *blk_get_request(struct request_queue *q, unsigned int op,
-		blk_mq_req_flags_t flags);
 int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
 		struct bio_set *bs, gfp_t gfp_mask,
 		int (*bio_ctr)(struct bio *, struct bio *, void *), void *data);
-- 
cgit v1.2.3


From ee12203746e5cec678c7d126c9901548bfec1dd5 Mon Sep 17 00:00:00 2001
From: Ben Widawsky <ben.widawsky@intel.com>
Date: Sat, 9 Oct 2021 09:44:39 -0700
Subject: PCI: Add pci_find_dvsec_capability to find designated VSEC

Add pci_find_dvsec_capability to locate a Designated Vendor-Specific
Extended Capability with the specified Vendor ID and Capability ID.

The Designated Vendor-Specific Extended Capability (DVSEC) allows one or
more "vendor" specific capabilities that are not tied to the Vendor ID
of the PCI component. Where the DVSEC Vendor may be a standards body
like CXL.

Cc: David E. Box <david.e.box@linux.intel.com>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: linux-pci@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: Andrew Donnellan <ajd@linux.ibm.com>
Cc: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Frederic Barrat <fbarrat@linux.ibm.com>
Signed-off-by: Ben Widawsky <ben.widawsky@intel.com>
Reviewed-by: Andrew Donnellan <ajd@linux.ibm.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Tested-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Link: https://lore.kernel.org/r/163379787943.692348.6814373487017444007.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/pci.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index cd8aa6fce204..c93ccfa4571b 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1130,6 +1130,7 @@ u16 pci_find_ext_capability(struct pci_dev *dev, int cap);
 u16 pci_find_next_ext_capability(struct pci_dev *dev, u16 pos, int cap);
 struct pci_bus *pci_find_next_bus(const struct pci_bus *from);
 u16 pci_find_vsec_capability(struct pci_dev *dev, u16 vendor, int cap);
+u16 pci_find_dvsec_capability(struct pci_dev *dev, u16 vendor, u16 dvsec);
 
 u64 pci_get_dsn(struct pci_dev *dev);
 
-- 
cgit v1.2.3


From 26d5badbccddcc063dc5174a2baffd13a23322aa Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 20 Oct 2021 12:43:59 -0500
Subject: signal: Implement force_fatal_sig

Add a simple helper force_fatal_sig that causes a signal to be
delivered to a process as if the signal handler was set to SIG_DFL.

Reimplement force_sigsegv based upon this new helper.  This fixes
force_sigsegv so that when it forces the default signal handler
to be used the code now forces the signal to be unblocked as well.

Reusing the tested logic in force_sig_info_to_task that was built for
force_sig_seccomp this makes the implementation trivial.

This is interesting both because it makes force_sigsegv simpler and
because there are a couple of buggy places in the kernel that call
do_exit(SIGILL) or do_exit(SIGSYS) because there is no straight
forward way today for those places to simply force the exit of a
process with the chosen signal.  Creating force_fatal_sig allows
those places to be implemented with normal signal exits.

Link: https://lkml.kernel.org/r/20211020174406.17889-13-ebiederm@xmission.com
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 include/linux/sched/signal.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index e5f4ce622ee6..e2dc9f119ada 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -338,6 +338,7 @@ extern int kill_pid(struct pid *pid, int sig, int priv);
 extern __must_check bool do_notify_parent(struct task_struct *, int);
 extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
 extern void force_sig(int);
+extern void force_fatal_sig(int);
 extern int send_sig(int, struct task_struct *, int);
 extern int zap_other_threads(struct task_struct *p);
 extern struct sigqueue *sigqueue_alloc(void);
-- 
cgit v1.2.3


From 5bf84b29938579a9350a4a9c2c4f8b5da2aa6992 Mon Sep 17 00:00:00 2001
From: Brett Creeley <brett.creeley@intel.com>
Date: Fri, 7 May 2021 15:09:10 -0700
Subject: virtchnl: Remove unused VIRTCHNL_VF_OFFLOAD_RSVD define

Remove unused define that is currently marked as reserved. This will
open up space for a new feature if/when it's introduced. Also, there is
no reason to keep unused defines around.

Suggested-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Signed-off-by: Brett Creeley <brett.creeley@intel.com>
Tested-by: Tony Brelinski <tony.brelinski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 include/linux/avf/virtchnl.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index db0e099c2399..2e1e1379b569 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -240,7 +240,6 @@ VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource);
  */
 #define VIRTCHNL_VF_OFFLOAD_L2			0x00000001
 #define VIRTCHNL_VF_OFFLOAD_IWARP		0x00000002
-#define VIRTCHNL_VF_OFFLOAD_RSVD		0x00000004
 #define VIRTCHNL_VF_OFFLOAD_RSS_AQ		0x00000008
 #define VIRTCHNL_VF_OFFLOAD_RSS_REG		0x00000010
 #define VIRTCHNL_VF_OFFLOAD_WB_ON_ITR		0x00000020
-- 
cgit v1.2.3


From 4a15022f82ee0f2e14a7f953b7bbc0f5c983b663 Mon Sep 17 00:00:00 2001
From: Brett Creeley <brett.creeley@intel.com>
Date: Fri, 7 May 2021 15:09:11 -0700
Subject: virtchnl: Use the BIT() macro for capability/offload flags

Currently raw hex values are used to define specific bits for each
capability/offload in virtchnl.h. Using raw hex values makes it
unclear which bits are used/available. Fix this by using the BIT()
macro so it's immediately obvious which bits are used/available.

Also, move the VIRTCHNL_VF_CAP_ADV_LINK_SPEED define in the correct
place to line up with the other bit values and add a comment for its
purpose.

Signed-off-by: Brett Creeley <brett.creeley@intel.com>
Tested-by: Tony Brelinski <tony.brelinski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 include/linux/avf/virtchnl.h | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index 2e1e1379b569..b30a1bc74fc7 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -238,26 +238,26 @@ VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource);
  * VIRTCHNL_VF_OFFLOAD_L2 flag is inclusive of base mode L2 offloads including
  * TX/RX Checksum offloading and TSO for non-tunnelled packets.
  */
-#define VIRTCHNL_VF_OFFLOAD_L2			0x00000001
-#define VIRTCHNL_VF_OFFLOAD_IWARP		0x00000002
-#define VIRTCHNL_VF_OFFLOAD_RSS_AQ		0x00000008
-#define VIRTCHNL_VF_OFFLOAD_RSS_REG		0x00000010
-#define VIRTCHNL_VF_OFFLOAD_WB_ON_ITR		0x00000020
-#define VIRTCHNL_VF_OFFLOAD_REQ_QUEUES		0x00000040
-#define VIRTCHNL_VF_OFFLOAD_VLAN		0x00010000
-#define VIRTCHNL_VF_OFFLOAD_RX_POLLING		0x00020000
-#define VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2	0x00040000
-#define VIRTCHNL_VF_OFFLOAD_RSS_PF		0X00080000
-#define VIRTCHNL_VF_OFFLOAD_ENCAP		0X00100000
-#define VIRTCHNL_VF_OFFLOAD_ENCAP_CSUM		0X00200000
-#define VIRTCHNL_VF_OFFLOAD_RX_ENCAP_CSUM	0X00400000
-#define VIRTCHNL_VF_OFFLOAD_ADQ			0X00800000
-#define VIRTCHNL_VF_OFFLOAD_USO			0X02000000
-#define VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF		0X08000000
-#define VIRTCHNL_VF_OFFLOAD_FDIR_PF		0X10000000
-
-/* Define below the capability flags that are not offloads */
-#define VIRTCHNL_VF_CAP_ADV_LINK_SPEED		0x00000080
+#define VIRTCHNL_VF_OFFLOAD_L2			BIT(0)
+#define VIRTCHNL_VF_OFFLOAD_IWARP		BIT(1)
+#define VIRTCHNL_VF_OFFLOAD_RSS_AQ		BIT(3)
+#define VIRTCHNL_VF_OFFLOAD_RSS_REG		BIT(4)
+#define VIRTCHNL_VF_OFFLOAD_WB_ON_ITR		BIT(5)
+#define VIRTCHNL_VF_OFFLOAD_REQ_QUEUES		BIT(6)
+/* used to negotiate communicating link speeds in Mbps */
+#define VIRTCHNL_VF_CAP_ADV_LINK_SPEED		BIT(7)
+#define VIRTCHNL_VF_OFFLOAD_VLAN		BIT(16)
+#define VIRTCHNL_VF_OFFLOAD_RX_POLLING		BIT(17)
+#define VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2	BIT(18)
+#define VIRTCHNL_VF_OFFLOAD_RSS_PF		BIT(19)
+#define VIRTCHNL_VF_OFFLOAD_ENCAP		BIT(20)
+#define VIRTCHNL_VF_OFFLOAD_ENCAP_CSUM		BIT(21)
+#define VIRTCHNL_VF_OFFLOAD_RX_ENCAP_CSUM	BIT(22)
+#define VIRTCHNL_VF_OFFLOAD_ADQ			BIT(23)
+#define VIRTCHNL_VF_OFFLOAD_USO			BIT(25)
+#define VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF		BIT(27)
+#define VIRTCHNL_VF_OFFLOAD_FDIR_PF		BIT(28)
+
 #define VF_BASE_MODE_OFFLOADS (VIRTCHNL_VF_OFFLOAD_L2 | \
 			       VIRTCHNL_VF_OFFLOAD_VLAN | \
 			       VIRTCHNL_VF_OFFLOAD_RSS_PF)
-- 
cgit v1.2.3


From 504e15724893a839213fad5eedfbd511d9ba75cc Mon Sep 17 00:00:00 2001
From: Paul Blakey <paulb@nvidia.com>
Date: Sun, 11 Jul 2021 16:56:54 +0300
Subject: net/mlx5: Allow skipping counter refresh on creation

CT creates a counter for each CT rule, and for each such counter,
fs_counters tries to queue mlx5_fc_stats_work() work again via
mod_delayed_work(0) call to refresh all counters. This call has a
large performance impact when reaching high insertion rate and
accounts for ~8% of the insertion time when using software steering.

Allow skipping the refresh of all counters during counter creation.
Change CT to use this refresh skipping for it's counters.

Signed-off-by: Paul Blakey <paulb@nvidia.com>
Reviewed-by: Roi Dayan <roid@nvidia.com>
Reviewed-by: Oz Shlomo <ozsh@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/fs.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index a7e1155bc4da..cd2d4c572367 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -245,6 +245,10 @@ int mlx5_modify_rule_destination(struct mlx5_flow_handle *handler,
 				 struct mlx5_flow_destination *old_dest);
 
 struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging);
+
+/* As mlx5_fc_create() but doesn't queue stats refresh thread. */
+struct mlx5_fc *mlx5_fc_create_ex(struct mlx5_core_dev *dev, bool aging);
+
 void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter);
 u64 mlx5_fc_query_lastuse(struct mlx5_fc *counter);
 void mlx5_fc_query_cached(struct mlx5_fc *counter,
-- 
cgit v1.2.3


From f89f9c56e7372b2dda144f83dce61311b298c559 Mon Sep 17 00:00:00 2001
From: Sven Peter <sven@svenpeter.dev>
Date: Mon, 25 Oct 2021 08:22:04 +0200
Subject: mailbox: apple: Add driver for Apple mailboxes

Apple SoCs such as the M1 come with various co-processors. Mailboxes
are used to communicate with those. This driver adds support for
two variants of those mailboxes.

Signed-off-by: Sven Peter <sven@svenpeter.dev>
Signed-off-by: Jassi Brar <jaswinder.singh@linaro.org>
---
 include/linux/apple-mailbox.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 include/linux/apple-mailbox.h

(limited to 'include/linux')

diff --git a/include/linux/apple-mailbox.h b/include/linux/apple-mailbox.h
new file mode 100644
index 000000000000..720fbb70294a
--- /dev/null
+++ b/include/linux/apple-mailbox.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-only OR MIT */
+/*
+ * Apple mailbox message format
+ *
+ * Copyright (C) 2021 The Asahi Linux Contributors
+ */
+
+#ifndef _LINUX_APPLE_MAILBOX_H_
+#define _LINUX_APPLE_MAILBOX_H_
+
+#include <linux/types.h>
+
+/* encodes a single 96bit message sent over the single channel */
+struct apple_mbox_msg {
+	u64 msg0;
+	u32 msg1;
+};
+
+#endif
-- 
cgit v1.2.3


From 97961f78e8bc7f50ff7113fec030af6fa5f004d0 Mon Sep 17 00:00:00 2001
From: Peng Fan <peng.fan@nxp.com>
Date: Fri, 22 Oct 2021 18:18:56 +0800
Subject: mailbox: imx: support i.MX8ULP S4 MU

Like i.MX8 SCU, i.MX8ULP S4 also has vendor specific protocol.
 - bind SCU/S4 MU part to share one tx/rx/init API to make code simple.
 - S4 msg max size is very large, so alloc the space at driver probe,
   not use local on stack variable.
 - S4 MU has 8 TR and 4 RR which is different with i.MX8 MU, so adapt
   code to reflect this.

   Tested on i.MX8MP, i.MX8ULP

Signed-off-by: Peng Fan <peng.fan@nxp.com>
Signed-off-by: Jassi Brar <jaswinder.singh@linaro.org>
---
 include/linux/firmware/imx/s4.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)
 create mode 100644 include/linux/firmware/imx/s4.h

(limited to 'include/linux')

diff --git a/include/linux/firmware/imx/s4.h b/include/linux/firmware/imx/s4.h
new file mode 100644
index 000000000000..9e34923ae1d6
--- /dev/null
+++ b/include/linux/firmware/imx/s4.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Copyright 2021 NXP
+ *
+ * Header file for the IPC implementation.
+ */
+
+#ifndef _S4_IPC_H
+#define _S4_IPC_H
+
+struct imx_s4_ipc;
+
+struct imx_s4_rpc_msg {
+	uint8_t ver;
+	uint8_t size;
+	uint8_t cmd;
+	uint8_t tag;
+} __packed;
+
+#endif /* _S4_IPC_H */
-- 
cgit v1.2.3


From cc8d7b4aea79df7cb45b74f9bc5b8a8bd2ed4c07 Mon Sep 17 00:00:00 2001
From: Anssi Hannula <anssi.hannula@bitwise.fi>
Date: Wed, 27 Oct 2021 13:21:24 +0300
Subject: tty: Fix extra "not" in TTY_DRIVER_REAL_RAW description

TTY_DRIVER_REAL_RAW flag (which is always set for e.g. serial ports)
documentation says that driver must always set special character
handling flags in certain conditions.

However, as the following sentence makes clear, what is actually
intended is the opposite.

Fix that by removing the unintended double negation.

Acked-by: Johan Hovold <johan@kernel.org>
Signed-off-by: Anssi Hannula <anssi.hannula@bitwise.fi>
Link: https://lore.kernel.org/r/20211027102124.3049414-1-anssi.hannula@bitwise.fi
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/tty_driver.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h
index 29e1cf178afb..795b94ccdeb6 100644
--- a/include/linux/tty_driver.h
+++ b/include/linux/tty_driver.h
@@ -360,7 +360,7 @@ static inline void tty_set_operations(struct tty_driver *driver,
  * 	Used for PTY's, in particular.
  * 
  * TTY_DRIVER_REAL_RAW --- if set, indicates that the driver will
- * 	guarantee never not to set any special character handling
+ * 	guarantee never to set any special character handling
  * 	flags if ((IGNBRK || (!BRKINT && !PARMRK)) && (IGNPAR ||
  * 	!INPCK)).  That is, if there is no reason for the driver to
  * 	send notifications of parity and break characters up to the
-- 
cgit v1.2.3


From f98a3dccfcb0b9b9c3bef8df9edd61cda80ad937 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 22 Oct 2021 13:59:38 +0200
Subject: locking: Remove spin_lock_flags() etc

parisc, ia64 and powerpc32 are the only remaining architectures that
provide custom arch_{spin,read,write}_lock_flags() functions, which are
meant to re-enable interrupts while waiting for a spinlock.

However, none of these can actually run into this codepath, because
it is only called on architectures without CONFIG_GENERIC_LOCKBREAK,
or when CONFIG_DEBUG_LOCK_ALLOC is set without CONFIG_LOCKDEP, and none
of those combinations are possible on the three architectures.

Going back in the git history, it appears that arch/mn10300 may have
been able to run into this code path, but there is a good chance that
it never worked. On the architectures that still exist, it was
already impossible to hit back in 2008 after the introduction of
CONFIG_GENERIC_LOCKBREAK, and possibly earlier.

As this is all dead code, just remove it and the helper functions built
around it. For arch/ia64, the inline asm could be cleaned up, but
it seems safer to leave it untouched.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Helge Deller <deller@gmx.de>  # parisc
Link: https://lore.kernel.org/r/20211022120058.1031690-1-arnd@kernel.org
---
 include/linux/lockdep.h          | 17 -----------------
 include/linux/rwlock.h           | 15 ---------------
 include/linux/rwlock_api_smp.h   |  6 ++----
 include/linux/spinlock.h         | 13 -------------
 include/linux/spinlock_api_smp.h |  9 ---------
 include/linux/spinlock_up.h      |  1 -
 6 files changed, 2 insertions(+), 59 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 9fe165beb0f9..467b94257105 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -481,23 +481,6 @@ do {								\
 
 #endif /* CONFIG_LOCK_STAT */
 
-#ifdef CONFIG_LOCKDEP
-
-/*
- * On lockdep we dont want the hand-coded irq-enable of
- * _raw_*_lock_flags() code, because lockdep assumes
- * that interrupts are not re-enabled during lock-acquire:
- */
-#define LOCK_CONTENDED_FLAGS(_lock, try, lock, lockfl, flags) \
-	LOCK_CONTENDED((_lock), (try), (lock))
-
-#else /* CONFIG_LOCKDEP */
-
-#define LOCK_CONTENDED_FLAGS(_lock, try, lock, lockfl, flags) \
-	lockfl((_lock), (flags))
-
-#endif /* CONFIG_LOCKDEP */
-
 #ifdef CONFIG_PROVE_LOCKING
 extern void print_irqtrace_events(struct task_struct *curr);
 #else
diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h
index 7ce9a51ae5c0..2c0ad417ce3c 100644
--- a/include/linux/rwlock.h
+++ b/include/linux/rwlock.h
@@ -30,31 +30,16 @@ do {								\
 
 #ifdef CONFIG_DEBUG_SPINLOCK
  extern void do_raw_read_lock(rwlock_t *lock) __acquires(lock);
-#define do_raw_read_lock_flags(lock, flags) do_raw_read_lock(lock)
  extern int do_raw_read_trylock(rwlock_t *lock);
  extern void do_raw_read_unlock(rwlock_t *lock) __releases(lock);
  extern void do_raw_write_lock(rwlock_t *lock) __acquires(lock);
-#define do_raw_write_lock_flags(lock, flags) do_raw_write_lock(lock)
  extern int do_raw_write_trylock(rwlock_t *lock);
  extern void do_raw_write_unlock(rwlock_t *lock) __releases(lock);
 #else
-
-#ifndef arch_read_lock_flags
-# define arch_read_lock_flags(lock, flags)	arch_read_lock(lock)
-#endif
-
-#ifndef arch_write_lock_flags
-# define arch_write_lock_flags(lock, flags)	arch_write_lock(lock)
-#endif
-
 # define do_raw_read_lock(rwlock)	do {__acquire(lock); arch_read_lock(&(rwlock)->raw_lock); } while (0)
-# define do_raw_read_lock_flags(lock, flags) \
-		do {__acquire(lock); arch_read_lock_flags(&(lock)->raw_lock, *(flags)); } while (0)
 # define do_raw_read_trylock(rwlock)	arch_read_trylock(&(rwlock)->raw_lock)
 # define do_raw_read_unlock(rwlock)	do {arch_read_unlock(&(rwlock)->raw_lock); __release(lock); } while (0)
 # define do_raw_write_lock(rwlock)	do {__acquire(lock); arch_write_lock(&(rwlock)->raw_lock); } while (0)
-# define do_raw_write_lock_flags(lock, flags) \
-		do {__acquire(lock); arch_write_lock_flags(&(lock)->raw_lock, *(flags)); } while (0)
 # define do_raw_write_trylock(rwlock)	arch_write_trylock(&(rwlock)->raw_lock)
 # define do_raw_write_unlock(rwlock)	do {arch_write_unlock(&(rwlock)->raw_lock); __release(lock); } while (0)
 #endif
diff --git a/include/linux/rwlock_api_smp.h b/include/linux/rwlock_api_smp.h
index abfb53ab11be..f1db6f17c4fb 100644
--- a/include/linux/rwlock_api_smp.h
+++ b/include/linux/rwlock_api_smp.h
@@ -157,8 +157,7 @@ static inline unsigned long __raw_read_lock_irqsave(rwlock_t *lock)
 	local_irq_save(flags);
 	preempt_disable();
 	rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
-	LOCK_CONTENDED_FLAGS(lock, do_raw_read_trylock, do_raw_read_lock,
-			     do_raw_read_lock_flags, &flags);
+	LOCK_CONTENDED(lock, do_raw_read_trylock, do_raw_read_lock);
 	return flags;
 }
 
@@ -184,8 +183,7 @@ static inline unsigned long __raw_write_lock_irqsave(rwlock_t *lock)
 	local_irq_save(flags);
 	preempt_disable();
 	rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
-	LOCK_CONTENDED_FLAGS(lock, do_raw_write_trylock, do_raw_write_lock,
-			     do_raw_write_lock_flags, &flags);
+	LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock);
 	return flags;
 }
 
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 45310ea1b1d7..f0447062eecd 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -177,7 +177,6 @@ do {									\
 
 #ifdef CONFIG_DEBUG_SPINLOCK
  extern void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock);
-#define do_raw_spin_lock_flags(lock, flags) do_raw_spin_lock(lock)
  extern int do_raw_spin_trylock(raw_spinlock_t *lock);
  extern void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock);
 #else
@@ -188,18 +187,6 @@ static inline void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock)
 	mmiowb_spin_lock();
 }
 
-#ifndef arch_spin_lock_flags
-#define arch_spin_lock_flags(lock, flags)	arch_spin_lock(lock)
-#endif
-
-static inline void
-do_raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long *flags) __acquires(lock)
-{
-	__acquire(lock);
-	arch_spin_lock_flags(&lock->raw_lock, *flags);
-	mmiowb_spin_lock();
-}
-
 static inline int do_raw_spin_trylock(raw_spinlock_t *lock)
 {
 	int ret = arch_spin_trylock(&(lock)->raw_lock);
diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h
index 6b8e1a0b137b..51fa0dab68c4 100644
--- a/include/linux/spinlock_api_smp.h
+++ b/include/linux/spinlock_api_smp.h
@@ -108,16 +108,7 @@ static inline unsigned long __raw_spin_lock_irqsave(raw_spinlock_t *lock)
 	local_irq_save(flags);
 	preempt_disable();
 	spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
-	/*
-	 * On lockdep we dont want the hand-coded irq-enable of
-	 * do_raw_spin_lock_flags() code, because lockdep assumes
-	 * that interrupts are not re-enabled during lock-acquire:
-	 */
-#ifdef CONFIG_LOCKDEP
 	LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
-#else
-	do_raw_spin_lock_flags(lock, &flags);
-#endif
 	return flags;
 }
 
diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h
index 0ac9112c1bbe..16521074b6f7 100644
--- a/include/linux/spinlock_up.h
+++ b/include/linux/spinlock_up.h
@@ -62,7 +62,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock)
 #define arch_spin_is_locked(lock)	((void)(lock), 0)
 /* for sched/core.c and kernel_lock.c: */
 # define arch_spin_lock(lock)		do { barrier(); (void)(lock); } while (0)
-# define arch_spin_lock_flags(lock, flags)	do { barrier(); (void)(lock); } while (0)
 # define arch_spin_unlock(lock)	do { barrier(); (void)(lock); } while (0)
 # define arch_spin_trylock(lock)	({ barrier(); (void)(lock); 1; })
 #endif /* DEBUG_SPINLOCK */
-- 
cgit v1.2.3


From 9cc2fa4f4a92ccc6760d764e7341be46ee8aaaa1 Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Tue, 5 Oct 2021 00:05:43 +0200
Subject: task_stack: Fix end_of_stack() for architectures with upwards-growing
 stack

The function end_of_stack() returns a pointer to the last entry of a
stack. For architectures like parisc where the stack grows upwards
return the pointer to the highest address in the stack.

Without this change I faced a crash on parisc, because the stackleak
functionality wrote STACKLEAK_POISON to the lowest address and thus
overwrote the first 4 bytes of the task_struct which included the
TIF_FLAGS.

Signed-off-by: Helge Deller <deller@gmx.de>
---
 include/linux/sched/task_stack.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h
index 2413427e439c..d10150587d81 100644
--- a/include/linux/sched/task_stack.h
+++ b/include/linux/sched/task_stack.h
@@ -25,7 +25,11 @@ static inline void *task_stack_page(const struct task_struct *task)
 
 static inline unsigned long *end_of_stack(const struct task_struct *task)
 {
+#ifdef CONFIG_STACK_GROWSUP
+	return (unsigned long *)((unsigned long)task->stack + THREAD_SIZE) - 1;
+#else
 	return task->stack;
+#endif
 }
 
 #elif !defined(__HAVE_THREAD_FUNCTIONS)
-- 
cgit v1.2.3


From e60b56e46b384cee1ad34e6adc164d883049c6c3 Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Tue, 19 Oct 2021 14:35:35 +0200
Subject: sched/fair: Wait before decaying max_newidle_lb_cost

Decay max_newidle_lb_cost only when it has not been updated for a while
and ensure to not decay a recently changed value.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Link: https://lore.kernel.org/r/20211019123537.17146-4-vincent.guittot@linaro.org
---
 include/linux/sched/topology.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 2f9166f6dec8..c07bfa2d80f2 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -105,7 +105,7 @@ struct sched_domain {
 
 	/* idle_balance() stats */
 	u64 max_newidle_lb_cost;
-	unsigned long next_decay_max_lb_cost;
+	unsigned long last_decay_max_lb_cost;
 
 	u64 avg_scan_cost;		/* select_idle_sibling */
 
-- 
cgit v1.2.3


From 7ff22787ba49c2e66dcec92f3e2b79ef6b6a0d71 Mon Sep 17 00:00:00 2001
From: Prashant Malani <pmalani@chromium.org>
Date: Mon, 4 Oct 2021 10:07:09 -0700
Subject: platform/chrome: cros_ec_proto: Use EC struct for features

The Chrome EC's features are returned through an
ec_response_get_features struct, but they are stored in an independent
array. Although the two are effectively the same at present (2 unsigned
32 bit ints), there is the possibility that they could go out of sync.
Avoid this by only using the EC struct to store the features.

Signed-off-by: Prashant Malani <pmalani@chromium.org>
Link: https://lore.kernel.org/r/20211004170716.86601-1-pmalani@chromium.org
Signed-off-by: Benson Leung <bleung@chromium.org>
---
 include/linux/platform_data/cros_ec_proto.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h
index 9d370816a419..df3c78c92ca2 100644
--- a/include/linux/platform_data/cros_ec_proto.h
+++ b/include/linux/platform_data/cros_ec_proto.h
@@ -205,7 +205,7 @@ struct cros_ec_dev {
 	struct cros_ec_debugfs *debug_info;
 	bool has_kb_wake_angle;
 	u16 cmd_offset;
-	u32 features[2];
+	struct ec_response_get_features features;
 };
 
 #define to_cros_ec_dev(dev)  container_of(dev, struct cros_ec_dev, class_dev)
-- 
cgit v1.2.3


From d89c8169bd7052c78731137da4c4c06986409c62 Mon Sep 17 00:00:00 2001
From: Wu Zongyong <wuzongyong@linux.alibaba.com>
Date: Fri, 29 Oct 2021 17:14:42 +0800
Subject: virtio-pci: introduce legacy device module

Split common codes from virtio-pci-legacy so vDPA driver can reuse it
later.

Signed-off-by: Wu Zongyong <wuzongyong@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/71605acde5e97fcb2760a6973e406279fb1bbd33.1635493219.git.wuzongyong@linux.alibaba.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/virtio_pci_legacy.h | 42 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 include/linux/virtio_pci_legacy.h

(limited to 'include/linux')

diff --git a/include/linux/virtio_pci_legacy.h b/include/linux/virtio_pci_legacy.h
new file mode 100644
index 000000000000..e5d665faf00e
--- /dev/null
+++ b/include/linux/virtio_pci_legacy.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_VIRTIO_PCI_LEGACY_H
+#define _LINUX_VIRTIO_PCI_LEGACY_H
+
+#include "linux/mod_devicetable.h"
+#include <linux/pci.h>
+#include <linux/virtio_pci.h>
+
+struct virtio_pci_legacy_device {
+	struct pci_dev *pci_dev;
+
+	/* Where to read and clear interrupt */
+	u8 __iomem *isr;
+	/* The IO mapping for the PCI config space (legacy mode only) */
+	void __iomem *ioaddr;
+
+	struct virtio_device_id id;
+};
+
+u64 vp_legacy_get_features(struct virtio_pci_legacy_device *ldev);
+u64 vp_legacy_get_driver_features(struct virtio_pci_legacy_device *ldev);
+void vp_legacy_set_features(struct virtio_pci_legacy_device *ldev,
+			u32 features);
+u8 vp_legacy_get_status(struct virtio_pci_legacy_device *ldev);
+void vp_legacy_set_status(struct virtio_pci_legacy_device *ldev,
+			u8 status);
+u16 vp_legacy_queue_vector(struct virtio_pci_legacy_device *ldev,
+			   u16 idx, u16 vector);
+u16 vp_legacy_config_vector(struct virtio_pci_legacy_device *ldev,
+		     u16 vector);
+void vp_legacy_set_queue_address(struct virtio_pci_legacy_device *ldev,
+			     u16 index, u32 queue_pfn);
+bool vp_legacy_get_queue_enable(struct virtio_pci_legacy_device *ldev,
+				u16 idx);
+void vp_legacy_set_queue_size(struct virtio_pci_legacy_device *ldev,
+			      u16 idx, u16 size);
+u16 vp_legacy_get_queue_size(struct virtio_pci_legacy_device *ldev,
+			     u16 idx);
+int vp_legacy_probe(struct virtio_pci_legacy_device *ldev);
+void vp_legacy_remove(struct virtio_pci_legacy_device *ldev);
+
+#endif
-- 
cgit v1.2.3


From d0ae1fbfcff48e889bf993ba16890e30f6615593 Mon Sep 17 00:00:00 2001
From: Wu Zongyong <wuzongyong@linux.alibaba.com>
Date: Fri, 29 Oct 2021 17:14:43 +0800
Subject: vdpa: fix typo

Signed-off-by: Wu Zongyong <wuzongyong@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/4b5153262e4ba64986bb567d7425ad4829ca7bcc.1635493219.git.wuzongyong@linux.alibaba.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/vdpa.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 3972ab765de1..a896ee021e5f 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -257,7 +257,7 @@ struct vdpa_config_ops {
 	struct vdpa_notification_area
 	(*get_vq_notification)(struct vdpa_device *vdev, u16 idx);
 	/* vq irq is not expected to be changed once DRIVER_OK is set */
-	int (*get_vq_irq)(struct vdpa_device *vdv, u16 idx);
+	int (*get_vq_irq)(struct vdpa_device *vdev, u16 idx);
 
 	/* Device ops */
 	u32 (*get_vq_align)(struct vdpa_device *vdev);
-- 
cgit v1.2.3


From 3b970a5842c9114c82e60744c84a7d06ee51b6f9 Mon Sep 17 00:00:00 2001
From: Wu Zongyong <wuzongyong@linux.alibaba.com>
Date: Fri, 29 Oct 2021 17:14:45 +0800
Subject: vdpa: add new callback get_vq_num_min in vdpa_config_ops

This callback is optional. For vdpa devices that not support to change
virtqueue size, get_vq_num_min and get_vq_num_max will return the same
value, so that users can choose a correct value for that device.

Suggested-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Wu Zongyong <wuzongyong@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/f4af5b0abd660d9a29ab6b2f67bd6df10284a230.1635493219.git.wuzongyong@linux.alibaba.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/vdpa.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index a896ee021e5f..30864848950b 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -171,6 +171,9 @@ struct vdpa_map_file {
  * @get_vq_num_max:		Get the max size of virtqueue
  *				@vdev: vdpa device
  *				Returns u16: max size of virtqueue
+ * @get_vq_num_min:		Get the min size of virtqueue (optional)
+ *				@vdev: vdpa device
+ *				Returns u16: min size of virtqueue
  * @get_device_id:		Get virtio device id
  *				@vdev: vdpa device
  *				Returns u32: virtio device id
@@ -266,6 +269,7 @@ struct vdpa_config_ops {
 	void (*set_config_cb)(struct vdpa_device *vdev,
 			      struct vdpa_callback *cb);
 	u16 (*get_vq_num_max)(struct vdpa_device *vdev);
+	u16 (*get_vq_num_min)(struct vdpa_device *vdev);
 	u32 (*get_device_id)(struct vdpa_device *vdev);
 	u32 (*get_vendor_id)(struct vdpa_device *vdev);
 	u8 (*get_status)(struct vdpa_device *vdev);
-- 
cgit v1.2.3


From d50497eb4e554e1f0351e1836ee7241c059592e6 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Tue, 19 Oct 2021 15:01:45 +0800
Subject: virtio_config: introduce a new .enable_cbs method

This patch introduces a new method to enable the callbacks for config
and virtqueues. This will be used for making sure the virtqueue
callbacks are only enabled after virtio_device_ready() if transport
implements this method.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20211019070152.8236-4-jasowang@redhat.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/virtio_config.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 8519b3ae5d52..4d107ad31149 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -23,6 +23,8 @@ struct virtio_shm_region {
  *       any of @get/@set, @get_status/@set_status, or @get_features/
  *       @finalize_features are NOT safe to be called from an atomic
  *       context.
+ * @enable_cbs: enable the callbacks
+ *      vdev: the virtio_device
  * @get: read the value of a configuration field
  *	vdev: the virtio_device
  *	offset: the offset of the configuration field
@@ -75,6 +77,7 @@ struct virtio_shm_region {
  */
 typedef void vq_callback_t(struct virtqueue *);
 struct virtio_config_ops {
+	void (*enable_cbs)(struct virtio_device *vdev);
 	void (*get)(struct virtio_device *vdev, unsigned offset,
 		    void *buf, unsigned len);
 	void (*set)(struct virtio_device *vdev, unsigned offset,
@@ -229,6 +232,9 @@ void virtio_device_ready(struct virtio_device *dev)
 {
 	unsigned status = dev->config->get_status(dev);
 
+	if (dev->config->enable_cbs)
+                  dev->config->enable_cbs(dev);
+
 	BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
 	dev->config->set_status(dev, status | VIRTIO_CONFIG_S_DRIVER_OK);
 }
-- 
cgit v1.2.3


From 939779f5152d161b34f612af29e7dc1ac4472fcf Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Wed, 27 Oct 2021 10:21:04 +0800
Subject: virtio_ring: validate used buffer length

This patch validate the used buffer length provided by the device
before trying to use it. This is done by record the in buffer length
in a new field in desc_state structure during virtqueue_add(), then we
can fail the virtqueue_get_buf() when we find the device is trying to
give us a used buffer length which is greater than the in buffer
length.

Since some drivers have already done the validation by themselves,
this patch tries to makes the core validation optional. For the driver
that doesn't want the validation, it can set the
suppress_used_validation to be true (which could be overridden by
force_used_validation module parameter). To be more efficient, a
dedicate array is used for storing the validate used length, this
helps to eliminate the cache stress if validation is done by the
driver.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20211027022107.14357-2-jasowang@redhat.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/virtio.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 41edbc01ffa4..44d0e09da2d9 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -152,6 +152,7 @@ size_t virtio_max_dma_size(struct virtio_device *vdev);
  * @feature_table_size: number of entries in the feature table array.
  * @feature_table_legacy: same as feature_table but when working in legacy mode.
  * @feature_table_size_legacy: number of entries in feature table legacy array.
+ * @suppress_used_validation: set to not have core validate used length
  * @probe: the function to call when a device is found.  Returns 0 or -errno.
  * @scan: optional function to call after successful probe; intended
  *    for virtio-scsi to invoke a scan.
@@ -168,6 +169,7 @@ struct virtio_driver {
 	unsigned int feature_table_size;
 	const unsigned int *feature_table_legacy;
 	unsigned int feature_table_size_legacy;
+	bool suppress_used_validation;
 	int (*validate)(struct virtio_device *dev);
 	int (*probe)(struct virtio_device *dev);
 	void (*scan)(struct virtio_device *dev);
-- 
cgit v1.2.3


From 6dbb1f1687a2ccdfc5b84b0a35bbc6dfefc4de3b Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Tue, 26 Oct 2021 20:55:12 +0300
Subject: vdpa: Introduce and use vdpa device get, set config helpers

Subsequent patches enable get and set configuration either
via management device or via vdpa device' config ops.

This requires synchronization between multiple callers to get and set
config callbacks. Features setting also influence the layout of the
configuration fields endianness.

To avoid exposing synchronization primitives to callers, introduce
helper for setting the configuration and use it.

Signed-off-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Eli Cohen <elic@nvidia.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Link: https://lore.kernel.org/r/20211026175519.87795-2-parav@nvidia.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/vdpa.h | 19 ++++---------------
 1 file changed, 4 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 30864848950b..267236aab34c 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -386,21 +386,10 @@ static inline int vdpa_set_features(struct vdpa_device *vdev, u64 features)
 	return ops->set_features(vdev, features);
 }
 
-static inline void vdpa_get_config(struct vdpa_device *vdev,
-				   unsigned int offset, void *buf,
-				   unsigned int len)
-{
-	const struct vdpa_config_ops *ops = vdev->config;
-
-	/*
-	 * Config accesses aren't supposed to trigger before features are set.
-	 * If it does happen we assume a legacy guest.
-	 */
-	if (!vdev->features_valid)
-		vdpa_set_features(vdev, 0);
-	ops->get_config(vdev, offset, buf, len);
-}
-
+void vdpa_get_config(struct vdpa_device *vdev, unsigned int offset,
+		     void *buf, unsigned int len);
+void vdpa_set_config(struct vdpa_device *dev, unsigned int offset,
+		     const void *buf, unsigned int length);
 /**
  * struct vdpa_mgmtdev_ops - vdpa device ops
  * @dev_add: Add a vdpa device using alloc and register
-- 
cgit v1.2.3


From ad69dd0bf26b88ec6ab26f8bbe5cd74fbed7672a Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Tue, 26 Oct 2021 20:55:13 +0300
Subject: vdpa: Introduce query of device config layout

Introduce a command to query a device config layout.

An example query of network vdpa device:

$ vdpa dev add name bar mgmtdev vdpasim_net

$ vdpa dev config show
bar: mac 00:35:09:19:48:05 link up link_announce false mtu 1500

$ vdpa dev config show -jp
{
    "config": {
        "bar": {
            "mac": "00:35:09:19:48:05",
            "link ": "up",
            "link_announce ": false,
            "mtu": 1500,
        }
    }
}

Signed-off-by: Parav Pandit <parav@nvidia.com>
Signed-off-by: Eli Cohen <elic@nvidia.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20211026175519.87795-3-parav@nvidia.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/vdpa.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 267236aab34c..5cc5e501397f 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -63,6 +63,7 @@ struct vdpa_mgmt_dev;
  * @dev: underlying device
  * @dma_dev: the actual device that is performing DMA
  * @config: the configuration ops for this device.
+ * @cf_mutex: Protects get and set access to configuration layout.
  * @index: device index
  * @features_valid: were features initialized? for legacy guests
  * @use_va: indicate whether virtual address must be used by this device
@@ -74,6 +75,7 @@ struct vdpa_device {
 	struct device dev;
 	struct device *dma_dev;
 	const struct vdpa_config_ops *config;
+	struct mutex cf_mutex; /* Protects get/set config */
 	unsigned int index;
 	bool features_valid;
 	bool use_va;
-- 
cgit v1.2.3


From 960deb33be3d08e55a39e40e0286a51c7448e053 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Tue, 26 Oct 2021 20:55:14 +0300
Subject: vdpa: Use kernel coding style for structure comments

As subsequent patch adds new structure field with comment, move the
structure comment to follow kernel coding style.

Signed-off-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Eli Cohen <elic@nvidia.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Link: https://lore.kernel.org/r/20211026175519.87795-4-parav@nvidia.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/vdpa.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 5cc5e501397f..fafb7202482c 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -411,10 +411,17 @@ struct vdpa_mgmtdev_ops {
 	void (*dev_del)(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev);
 };
 
+/**
+ * struct vdpa_mgmt_dev - vdpa management device
+ * @device: Management parent device
+ * @ops: operations supported by management device
+ * @id_table: Pointer to device id table of supported ids
+ * @list: list entry
+ */
 struct vdpa_mgmt_dev {
 	struct device *device;
 	const struct vdpa_mgmtdev_ops *ops;
-	const struct virtio_device_id *id_table; /* supported ids */
+	const struct virtio_device_id *id_table;
 	struct list_head list;
 };
 
-- 
cgit v1.2.3


From d8ca2fa5be1bdb9d08cfe1f831cddb622a01dfd4 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Tue, 26 Oct 2021 20:55:15 +0300
Subject: vdpa: Enable user to set mac and mtu of vdpa device

$ vdpa dev add name bar mgmtdev vdpasim_net mac 00:11:22:33:44:55 mtu 9000

$ vdpa dev config show
bar: mac 00:11:22:33:44:55 link up link_announce false mtu 9000

$ vdpa dev config show -jp
{
    "config": {
        "bar": {
            "mac": "00:11:22:33:44:55",
            "link ": "up",
            "link_announce ": false,
            "mtu": 9000,
        }
    }
}

Signed-off-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Eli Cohen <elic@nvidia.com>
Acked-by: Jason Wang <jasowang@redhat.com>

Link: https://lore.kernel.org/r/20211026175519.87795-5-parav@nvidia.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
---
 include/linux/vdpa.h | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index fafb7202482c..c3011ccda430 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -6,6 +6,8 @@
 #include <linux/device.h>
 #include <linux/interrupt.h>
 #include <linux/vhost_iotlb.h>
+#include <linux/virtio_net.h>
+#include <linux/if_ether.h>
 
 /**
  * struct vdpa_calllback - vDPA callback definition.
@@ -93,6 +95,14 @@ struct vdpa_iova_range {
 	u64 last;
 };
 
+struct vdpa_dev_set_config {
+	struct {
+		u8 mac[ETH_ALEN];
+		u16 mtu;
+	} net;
+	u64 mask;
+};
+
 /**
  * Corresponding file area for device memory mapping
  * @file: vma->vm_file for the mapping
@@ -397,6 +407,7 @@ void vdpa_set_config(struct vdpa_device *dev, unsigned int offset,
  * @dev_add: Add a vdpa device using alloc and register
  *	     @mdev: parent device to use for device addition
  *	     @name: name of the new vdpa device
+ *	     @config: config attributes to apply to the device under creation
  *	     Driver need to add a new device using _vdpa_register_device()
  *	     after fully initializing the vdpa device. Driver must return 0
  *	     on success or appropriate error code.
@@ -407,7 +418,8 @@ void vdpa_set_config(struct vdpa_device *dev, unsigned int offset,
  *	     _vdpa_unregister_device().
  */
 struct vdpa_mgmtdev_ops {
-	int (*dev_add)(struct vdpa_mgmt_dev *mdev, const char *name);
+	int (*dev_add)(struct vdpa_mgmt_dev *mdev, const char *name,
+		       const struct vdpa_dev_set_config *config);
 	void (*dev_del)(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev);
 };
 
@@ -416,12 +428,15 @@ struct vdpa_mgmtdev_ops {
  * @device: Management parent device
  * @ops: operations supported by management device
  * @id_table: Pointer to device id table of supported ids
+ * @config_attr_mask: bit mask of attributes of type enum vdpa_attr that
+ *		      management device support during dev_add callback
  * @list: list entry
  */
 struct vdpa_mgmt_dev {
 	struct device *device;
 	const struct vdpa_mgmtdev_ops *ops;
 	const struct virtio_device_id *id_table;
+	u64 config_attr_mask;
 	struct list_head list;
 };
 
-- 
cgit v1.2.3


From 7303524e04af49a47991e19f895c3b8cdc3796c7 Mon Sep 17 00:00:00 2001
From: Liu Jian <liujian56@huawei.com>
Date: Fri, 29 Oct 2021 22:12:14 +0800
Subject: skmsg: Lose offset info in sk_psock_skb_ingress

If sockmap enable strparser, there are lose offset info in
sk_psock_skb_ingress(). If the length determined by parse_msg function is not
skb->len, the skb will be converted to sk_msg multiple times, and userspace
app will get the data multiple times.

Fix this by get the offset and length from strp_msg. And as Cong suggested,
add one bit in skb->_sk_redir to distinguish enable or disable strparser.

Fixes: 604326b41a6fb ("bpf, sockmap: convert to generic sk_msg interface")
Signed-off-by: Liu Jian <liujian56@huawei.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Cong Wang <cong.wang@bytedance.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20211029141216.211899-1-liujian56@huawei.com
---
 include/linux/skmsg.h | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 14ab0c0bc924..94e2a1f6e58d 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -508,8 +508,22 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock)
 
 #if IS_ENABLED(CONFIG_NET_SOCK_MSG)
 
-/* We only have one bit so far. */
-#define BPF_F_PTR_MASK ~(BPF_F_INGRESS)
+#define BPF_F_STRPARSER	(1UL << 1)
+
+/* We only have two bits so far. */
+#define BPF_F_PTR_MASK ~(BPF_F_INGRESS | BPF_F_STRPARSER)
+
+static inline bool skb_bpf_strparser(const struct sk_buff *skb)
+{
+	unsigned long sk_redir = skb->_sk_redir;
+
+	return sk_redir & BPF_F_STRPARSER;
+}
+
+static inline void skb_bpf_set_strparser(struct sk_buff *skb)
+{
+	skb->_sk_redir |= BPF_F_STRPARSER;
+}
 
 static inline bool skb_bpf_ingress(const struct sk_buff *skb)
 {
-- 
cgit v1.2.3


From 588e5d8766486e52ee332a4bb097b016a355b465 Mon Sep 17 00:00:00 2001
From: He Fengqing <hefengqing@huawei.com>
Date: Fri, 29 Oct 2021 02:39:06 +0000
Subject: cgroup: bpf: Move wrapper for __cgroup_bpf_*() to kernel/bpf/cgroup.c

In commit 324bda9e6c5a("bpf: multi program support for cgroup+bpf")
cgroup_bpf_*() called from kernel/bpf/syscall.c, but now they are only
used in kernel/bpf/cgroup.c, so move these function to
kernel/bpf/cgroup.c, like cgroup_bpf_replace().

Signed-off-by: He Fengqing <hefengqing@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/bpf-cgroup.h | 20 --------------------
 1 file changed, 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 2746fd804216..9aad4e3ca29b 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -157,26 +157,6 @@ struct cgroup_bpf {
 int cgroup_bpf_inherit(struct cgroup *cgrp);
 void cgroup_bpf_offline(struct cgroup *cgrp);
 
-int __cgroup_bpf_attach(struct cgroup *cgrp,
-			struct bpf_prog *prog, struct bpf_prog *replace_prog,
-			struct bpf_cgroup_link *link,
-			enum bpf_attach_type type, u32 flags);
-int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
-			struct bpf_cgroup_link *link,
-			enum bpf_attach_type type);
-int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
-		       union bpf_attr __user *uattr);
-
-/* Wrapper for __cgroup_bpf_*() protected by cgroup_mutex */
-int cgroup_bpf_attach(struct cgroup *cgrp,
-		      struct bpf_prog *prog, struct bpf_prog *replace_prog,
-		      struct bpf_cgroup_link *link, enum bpf_attach_type type,
-		      u32 flags);
-int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
-		      enum bpf_attach_type type);
-int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
-		     union bpf_attr __user *uattr);
-
 int __cgroup_bpf_run_filter_skb(struct sock *sk,
 				struct sk_buff *skb,
 				enum cgroup_bpf_attach_type atype);
-- 
cgit v1.2.3


From ebe4560ed5c8cbfe3759f16c23ca5a6df090c6b5 Mon Sep 17 00:00:00 2001
From: Oscar Carter <oscar.carter@gmx.com>
Date: Sat, 30 May 2020 11:08:39 +0200
Subject: firewire: Remove function callback casts

In 1394 OHCI specification, Isochronous Receive DMA context has several
modes. One of mode is 'BufferFill' and Linux FireWire stack uses it to
receive isochronous packets for multiple isochronous channel as
FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL.

The mode is not used by in-kernel driver, while it's available for
userspace. The character device driver in firewire-core includes
cast of function callback for the mode since the type of callback
function is different from the other modes. The case is inconvenient
to effort of Control Flow Integrity builds due to
-Wcast-function-type warning.

This commit removes the cast. A static helper function is newly added
to initialize isochronous context for the mode. The helper function
arranges isochronous context to assign specific callback function
after call of existent kernel API. It's noticeable that the number of
isochronous channel, speed, and the size of header are not required for
the mode. The helper function is used for the mode by character device
driver instead of direct call of existent kernel API.

The same goal can be achieved (in the ioctl_create_iso_context function)
without this helper function as follows:
- Call the fw_iso_context_create function passing NULL to the callback
  parameter.
- Then setting the context->callback.sc or context->callback.mc
  variables based on the a->type value.

However using the helper function created in this patch makes code more
clear and declarative. This way avoid the call to a function with one
purpose to achieved another one.

Co-developed-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Co-developed-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
Signed-off-by: Oscar Carter <oscar.carter@gmx.com>
Reviewed-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Testeb-by: Takashi Sakamoto<o-takashi@sakamocchi.jp>
Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
---
 include/linux/firewire.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index aec8f30ab200..07967a450eaa 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -436,6 +436,12 @@ typedef void (*fw_iso_callback_t)(struct fw_iso_context *context,
 				  void *header, void *data);
 typedef void (*fw_iso_mc_callback_t)(struct fw_iso_context *context,
 				     dma_addr_t completed, void *data);
+
+union fw_iso_callback {
+	fw_iso_callback_t sc;
+	fw_iso_mc_callback_t mc;
+};
+
 struct fw_iso_context {
 	struct fw_card *card;
 	int type;
@@ -443,10 +449,7 @@ struct fw_iso_context {
 	int speed;
 	bool drop_overflow_headers;
 	size_t header_size;
-	union {
-		fw_iso_callback_t sc;
-		fw_iso_mc_callback_t mc;
-	} callback;
+	union fw_iso_callback callback;
 	void *callback_data;
 };
 
-- 
cgit v1.2.3


From 74128d801b5165650ae6222e8cf23780fbc55e67 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Thu, 23 Sep 2021 01:09:45 +0200
Subject: watchdog: ux500_wdt: Drop platform data

Drop the platform data passing from the PRCMU driver. This platform
data was part of the ambition to support more SoCs, which in turn
were never mass produced.

Only a name remains of the MFD cell so switch to MFD_CELL_NAME().

Cc: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Link: https://lore.kernel.org/r/20210922230947.1864357-1-linus.walleij@linaro.org
Acked-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
---
 include/linux/platform_data/ux500_wdt.h | 18 ------------------
 1 file changed, 18 deletions(-)
 delete mode 100644 include/linux/platform_data/ux500_wdt.h

(limited to 'include/linux')

diff --git a/include/linux/platform_data/ux500_wdt.h b/include/linux/platform_data/ux500_wdt.h
deleted file mode 100644
index de6a4ad41e76..000000000000
--- a/include/linux/platform_data/ux500_wdt.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) ST Ericsson SA 2011
- *
- * STE Ux500 Watchdog platform data
- */
-#ifndef __UX500_WDT_H
-#define __UX500_WDT_H
-
-/**
- * struct ux500_wdt_data
- */
-struct ux500_wdt_data {
-	unsigned int timeout;
-	bool has_28_bits_resolution;
-};
-
-#endif /* __UX500_WDT_H */
-- 
cgit v1.2.3


From 31a645aea4f8da5bb190ce322c6e5aacaef13855 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Mon, 25 Oct 2021 14:40:22 +0800
Subject: bpf: Factor out a helper to prepare trampoline for struct_ops prog

Factor out a helper bpf_struct_ops_prepare_trampoline() to prepare
trampoline for BPF_PROG_TYPE_STRUCT_OPS prog. It will be used by
.test_run callback in following patch.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20211025064025.2567443-2-houtao1@huawei.com
---
 include/linux/bpf.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 6deebf8bf78f..aabd3540aaaf 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1000,6 +1000,10 @@ bool bpf_struct_ops_get(const void *kdata);
 void bpf_struct_ops_put(const void *kdata);
 int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
 				       void *value);
+int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_progs *tprogs,
+				      struct bpf_prog *prog,
+				      const struct btf_func_model *model,
+				      void *image, void *image_end);
 static inline bool bpf_try_module_get(const void *data, struct module *owner)
 {
 	if (owner == BPF_MODULE_OWNER)
-- 
cgit v1.2.3


From 35346ab64132d0f5919b06932d708c0d10360553 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Mon, 25 Oct 2021 14:40:23 +0800
Subject: bpf: Factor out helpers for ctx access checking

Factor out two helpers to check the read access of ctx for raw tp
and BTF function. bpf_tracing_ctx_access() is used to check
the read access to argument is valid, and bpf_tracing_btf_ctx_access()
checks whether the btf type of argument is valid besides the checking
of argument read. bpf_tracing_btf_ctx_access() will be used by the
following patch.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20211025064025.2567443-3-houtao1@huawei.com
---
 include/linux/bpf.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index aabd3540aaaf..67f71e7def56 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1650,6 +1650,29 @@ bool bpf_prog_test_check_kfunc_call(u32 kfunc_id, struct module *owner);
 bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 		    const struct bpf_prog *prog,
 		    struct bpf_insn_access_aux *info);
+
+static inline bool bpf_tracing_ctx_access(int off, int size,
+					  enum bpf_access_type type)
+{
+	if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS)
+		return false;
+	if (type != BPF_READ)
+		return false;
+	if (off % size != 0)
+		return false;
+	return true;
+}
+
+static inline bool bpf_tracing_btf_ctx_access(int off, int size,
+					      enum bpf_access_type type,
+					      const struct bpf_prog *prog,
+					      struct bpf_insn_access_aux *info)
+{
+	if (!bpf_tracing_ctx_access(off, size, type))
+		return false;
+	return btf_ctx_access(off, size, type, prog, info);
+}
+
 int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf,
 		      const struct btf_type *t, int off, int size,
 		      enum bpf_access_type atype,
-- 
cgit v1.2.3


From c196906d50e360d82ed9aa5596a9d0ce89b7ab78 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Mon, 25 Oct 2021 14:40:24 +0800
Subject: bpf: Add dummy BPF STRUCT_OPS for test purpose

Currently the test of BPF STRUCT_OPS depends on the specific bpf
implementation of tcp_congestion_ops, but it can not cover all
basic functionalities (e.g, return value handling), so introduce
a dummy BPF STRUCT_OPS for test purpose.

Loading a bpf_dummy_ops implementation from userspace is prohibited,
and its only purpose is to run BPF_PROG_TYPE_STRUCT_OPS program
through bpf(BPF_PROG_TEST_RUN). Now programs for test_1() & test_2()
are supported. The following three cases are exercised in
bpf_dummy_struct_ops_test_run():

(1) test and check the value returned from state arg in test_1(state)
The content of state is copied from userspace pointer and copied back
after calling test_1(state). The user pointer is saved in an u64 array
and the array address is passed through ctx_in.

(2) test and check the return value of test_1(NULL)
Just simulate the case in which an invalid input argument is passed in.

(3) test multiple arguments passing in test_2(state, ...)
5 arguments are passed through ctx_in in form of u64 array. The first
element of array is userspace pointer of state and others 4 arguments
follow.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20211025064025.2567443-4-houtao1@huawei.com
---
 include/linux/bpf.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 67f71e7def56..c098089c1b54 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1018,6 +1018,22 @@ static inline void bpf_module_put(const void *data, struct module *owner)
 	else
 		module_put(owner);
 }
+
+#ifdef CONFIG_NET
+/* Define it here to avoid the use of forward declaration */
+struct bpf_dummy_ops_state {
+	int val;
+};
+
+struct bpf_dummy_ops {
+	int (*test_1)(struct bpf_dummy_ops_state *cb);
+	int (*test_2)(struct bpf_dummy_ops_state *cb, int a1, unsigned short a2,
+		      char a3, unsigned long a4);
+};
+
+int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
+			    union bpf_attr __user *uattr);
+#endif
 #else
 static inline const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id)
 {
-- 
cgit v1.2.3


From 8845b4681bf44b9d2d2badf2c67cf476e42a86bd Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannekoong@fb.com>
Date: Fri, 29 Oct 2021 15:49:08 -0700
Subject: bpf: Add alignment padding for "map_extra" + consolidate holes

This patch makes 2 changes regarding alignment padding
for the "map_extra" field.

1) In the kernel header, "map_extra" and "btf_value_type_id"
are rearranged to consolidate the hole.

Before:
struct bpf_map {
	...
        u32		max_entries;	/*    36     4	*/
        u32		map_flags;	/*    40     4	*/

        /* XXX 4 bytes hole, try to pack */

        u64		map_extra;	/*    48     8	*/
        int		spin_lock_off;	/*    56     4	*/
        int		timer_off;	/*    60     4	*/
        /* --- cacheline 1 boundary (64 bytes) --- */
        u32		id;		/*    64     4	*/
        int		numa_node;	/*    68     4	*/
	...
        bool		frozen;		/*   117     1	*/

        /* XXX 10 bytes hole, try to pack */

        /* --- cacheline 2 boundary (128 bytes) --- */
	...
        struct work_struct	work;	/*   144    72	*/

        /* --- cacheline 3 boundary (192 bytes) was 24 bytes ago --- */
	struct mutex	freeze_mutex;	/*   216   144 	*/

        /* --- cacheline 5 boundary (320 bytes) was 40 bytes ago --- */
        u64		writecnt; 	/*   360     8	*/

    /* size: 384, cachelines: 6, members: 26 */
    /* sum members: 354, holes: 2, sum holes: 14 */
    /* padding: 16 */
    /* forced alignments: 2, forced holes: 1, sum forced holes: 10 */

} __attribute__((__aligned__(64)));

After:
struct bpf_map {
	...
        u32		max_entries;	/*    36     4	*/
        u64		map_extra;	/*    40     8 	*/
        u32		map_flags;	/*    48     4	*/
        int		spin_lock_off;	/*    52     4	*/
        int		timer_off;	/*    56     4	*/
        u32		id;		/*    60     4	*/

        /* --- cacheline 1 boundary (64 bytes) --- */
        int		numa_node;	/*    64     4	*/
	...
	bool		frozen		/*   113     1  */

        /* XXX 14 bytes hole, try to pack */

        /* --- cacheline 2 boundary (128 bytes) --- */
	...
        struct work_struct	work;	/*   144    72	*/

        /* --- cacheline 3 boundary (192 bytes) was 24 bytes ago --- */
        struct mutex	freeze_mutex;	/*   216   144	*/

        /* --- cacheline 5 boundary (320 bytes) was 40 bytes ago --- */
        u64		writecnt;       /*   360     8	*/

    /* size: 384, cachelines: 6, members: 26 */
    /* sum members: 354, holes: 1, sum holes: 14 */
    /* padding: 16 */
    /* forced alignments: 2, forced holes: 1, sum forced holes: 14 */

} __attribute__((__aligned__(64)));

2) Add alignment padding to the bpf_map_info struct
More details can be found in commit 36f9814a494a ("bpf: fix uapi hole
for 32 bit compat applications")

Signed-off-by: Joanne Koong <joannekoong@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20211029224909.1721024-3-joannekoong@fb.com
---
 include/linux/bpf.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c098089c1b54..f6743d4bb531 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -168,23 +168,23 @@ struct bpf_map {
 	u32 key_size;
 	u32 value_size;
 	u32 max_entries;
-	u32 map_flags;
 	u64 map_extra; /* any per-map-type extra fields */
+	u32 map_flags;
 	int spin_lock_off; /* >=0 valid offset, <0 error */
 	int timer_off; /* >=0 valid offset, <0 error */
 	u32 id;
 	int numa_node;
 	u32 btf_key_type_id;
 	u32 btf_value_type_id;
+	u32 btf_vmlinux_value_type_id;
 	struct btf *btf;
 #ifdef CONFIG_MEMCG_KMEM
 	struct mem_cgroup *memcg;
 #endif
 	char name[BPF_OBJ_NAME_LEN];
-	u32 btf_vmlinux_value_type_id;
 	bool bypass_spec_v1;
 	bool frozen; /* write-once; write-protected by freeze_mutex */
-	/* 22 bytes hole */
+	/* 14 bytes hole */
 
 	/* The 3rd and 4th cacheline with misc members to avoid false sharing
 	 * particularly with refcounting.
-- 
cgit v1.2.3


From e66435936756d9bce96433be183358a8994a0f0d Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 1 Nov 2021 14:56:37 -0700
Subject: mm: fix mismerge of folio page flag manipulators

I had missed a semantic conflict between commit d389a4a81155 ("mm: Add
folio flag manipulation functions") from the folio tree, and commit
eac96c3efdb5 ("mm: filemap: check if THP has hwpoisoned subpage for PMD
page fault") that added a new set of page flags.

My build tests had too many options enabled, which hid this issue.  But
if you didn't have MEMORY_FAILURE or TRANSPARENT_HUGEPAGE enabled, you'd
end up with build errors like this:

  include/linux/page-flags.h:806:29: error: macro "PAGEFLAG_FALSE" requires 2 arguments, but only 1 given
    806 | PAGEFLAG_FALSE(HasHWPoisoned)
        |                             ^

due to the missing lowercase name used for folio function naming.

Fixes: 49f8275c7d92 ("Merge tag 'folio-5.16' of git://git.infradead.org/users/willy/pagecache")
Reported-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Reported-by: Yang Shi <shy828301@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page-flags.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index d8623d6e1141..981341a3c3c4 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -803,8 +803,8 @@ PAGEFLAG_FALSE(DoubleMap, double_map)
 PAGEFLAG(HasHWPoisoned, has_hwpoisoned, PF_SECOND)
 	TESTSCFLAG(HasHWPoisoned, has_hwpoisoned, PF_SECOND)
 #else
-PAGEFLAG_FALSE(HasHWPoisoned)
-	TESTSCFLAG_FALSE(HasHWPoisoned)
+PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned)
+	TESTSCFLAG_FALSE(HasHWPoisoned, has_hwpoisoned)
 #endif
 
 /*
-- 
cgit v1.2.3


From f1a456f8f3fc5828d8abcad941860380ae147b1d Mon Sep 17 00:00:00 2001
From: Talal Ahmad <talalahmad@google.com>
Date: Fri, 29 Oct 2021 22:05:42 -0400
Subject: net: avoid double accounting for pure zerocopy skbs

Track skbs with only zerocopy data and avoid charging them to kernel
memory to correctly account the memory utilization for msg_zerocopy.
All of the data in such skbs is held in user pages which are already
accounted to user. Before this change, they are charged again in
kernel in __zerocopy_sg_from_iter. The charging in kernel is
excessive because data is not being copied into skb frags. This
excessive charging can lead to kernel going into memory pressure
state which impacts all sockets in the system adversely. Mark pure
zerocopy skbs with a SKBFL_PURE_ZEROCOPY flag and remove
charge/uncharge for data in such skbs.

Initially, an skb is marked pure zerocopy when it is empty and in
zerocopy path. skb can then change from a pure zerocopy skb to mixed
data skb (zerocopy and copy data) if it is at tail of write queue and
there is room available in it and non-zerocopy data is being sent in
the next sendmsg call. At this time sk_mem_charge is done for the pure
zerocopied data and the pure zerocopy flag is unmarked. We found that
this happens very rarely on workloads that pass MSG_ZEROCOPY.

A pure zerocopy skb can later be coalesced into normal skb if they are
next to each other in queue but this patch prevents coalescing from
happening. This avoids complexity of charging when skb downgrades from
pure zerocopy to mixed. This is also rare.

In sk_wmem_free_skb, if it is a pure zerocopy skb, an sk_mem_uncharge
for SKB_TRUESIZE(MAX_TCP_HEADER) is done for sk_mem_charge in
tcp_skb_entail for an skb without data.

Testing with the msg_zerocopy.c benchmark between two hosts(100G nics)
with zerocopy showed that before this patch the 'sock' variable in
memory.stat for cgroup2 that tracks sum of sk_forward_alloc,
sk_rmem_alloc and sk_wmem_queued is around 1822720 and with this
change it is 0. This is due to no charge to sk_forward_alloc for
zerocopy data and shows memory utilization for kernel is lowered.

Signed-off-by: Talal Ahmad <talalahmad@google.com>
Acked-by: Arjun Roy <arjunroy@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 0bd6520329f6..10869906cc57 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -454,9 +454,15 @@ enum {
 	 * all frags to avoid possible bad checksum
 	 */
 	SKBFL_SHARED_FRAG = BIT(1),
+
+	/* segment contains only zerocopy data and should not be
+	 * charged to the kernel memory.
+	 */
+	SKBFL_PURE_ZEROCOPY = BIT(2),
 };
 
 #define SKBFL_ZEROCOPY_FRAG	(SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG)
+#define SKBFL_ALL_ZEROCOPY	(SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY)
 
 /*
  * The callback notifies userspace to release buffers when skb DMA is done in
@@ -1464,6 +1470,17 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb)
 	return is_zcopy ? skb_uarg(skb) : NULL;
 }
 
+static inline bool skb_zcopy_pure(const struct sk_buff *skb)
+{
+	return skb_shinfo(skb)->flags & SKBFL_PURE_ZEROCOPY;
+}
+
+static inline bool skb_pure_zcopy_same(const struct sk_buff *skb1,
+				       const struct sk_buff *skb2)
+{
+	return skb_zcopy_pure(skb1) == skb_zcopy_pure(skb2);
+}
+
 static inline void net_zcopy_get(struct ubuf_info *uarg)
 {
 	refcount_inc(&uarg->refcnt);
@@ -1528,7 +1545,7 @@ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success)
 		if (!skb_zcopy_is_nouarg(skb))
 			uarg->callback(skb, uarg, zerocopy_success);
 
-		skb_shinfo(skb)->flags &= ~SKBFL_ZEROCOPY_FRAG;
+		skb_shinfo(skb)->flags &= ~SKBFL_ALL_ZEROCOPY;
 	}
 }
 
-- 
cgit v1.2.3


From fcdb44d08a95003c3d040aecdee286156ec6f34e Mon Sep 17 00:00:00 2001
From: James Prestwood <prestwoj@gmail.com>
Date: Mon, 1 Nov 2021 10:36:28 -0700
Subject: net: arp: introduce arp_evict_nocarrier sysctl parameter

This change introduces a new sysctl parameter, arp_evict_nocarrier.
When set (default) the ARP cache will be cleared on a NOCARRIER event.
This new option has been defaulted to '1' which maintains existing
behavior.

Clearing the ARP cache on NOCARRIER is relatively new, introduced by:

commit 859bd2ef1fc1110a8031b967ee656c53a6260a76
Author: David Ahern <dsahern@gmail.com>
Date:   Thu Oct 11 20:33:49 2018 -0700

    net: Evict neighbor entries on carrier down

The reason for this changes is to prevent the ARP cache from being
cleared when a wireless device roams. Specifically for wireless roams
the ARP cache should not be cleared because the underlying network has not
changed. Clearing the ARP cache in this case can introduce significant
delays sending out packets after a roam.

A user reported such a situation here:

https://lore.kernel.org/linux-wireless/CACsRnHWa47zpx3D1oDq9JYnZWniS8yBwW1h0WAVZ6vrbwL_S0w@mail.gmail.com/

After some investigation it was found that the kernel was holding onto
packets until ARP finished which resulted in this 1 second delay. It
was also found that the first ARP who-has was never responded to,
which is actually what caues the delay. This change is more or less
working around this behavior, but again, there is no reason to clear
the cache on a roam anyways.

As for the unanswered who-has, we know the packet made it OTA since
it was seen while monitoring. Why it never received a response is
unknown. In any case, since this is a problem on the AP side of things
all that can be done is to work around it until it is solved.

Some background on testing/reproducing the packet delay:

Hardware:
 - 2 access points configured for Fast BSS Transition (Though I don't
   see why regular reassociation wouldn't have the same behavior)
 - Wireless station running IWD as supplicant
 - A device on network able to respond to pings (I used one of the APs)

Procedure:
 - Connect to first AP
 - Ping once to establish an ARP entry
 - Start a tcpdump
 - Roam to second AP
 - Wait for operstate UP event, and note the timestamp
 - Start pinging

Results:

Below is the tcpdump after UP. It was recorded the interface went UP at
10:42:01.432875.

10:42:01.461871 ARP, Request who-has 192.168.254.1 tell 192.168.254.71, length 28
10:42:02.497976 ARP, Request who-has 192.168.254.1 tell 192.168.254.71, length 28
10:42:02.507162 ARP, Reply 192.168.254.1 is-at ac:86:74:55:b0:20, length 46
10:42:02.507185 IP 192.168.254.71 > 192.168.254.1: ICMP echo request, id 52792, seq 1, length 64
10:42:02.507205 IP 192.168.254.71 > 192.168.254.1: ICMP echo request, id 52792, seq 2, length 64
10:42:02.507212 IP 192.168.254.71 > 192.168.254.1: ICMP echo request, id 52792, seq 3, length 64
10:42:02.507219 IP 192.168.254.71 > 192.168.254.1: ICMP echo request, id 52792, seq 4, length 64
10:42:02.507225 IP 192.168.254.71 > 192.168.254.1: ICMP echo request, id 52792, seq 5, length 64
10:42:02.507232 IP 192.168.254.71 > 192.168.254.1: ICMP echo request, id 52792, seq 6, length 64
10:42:02.515373 IP 192.168.254.1 > 192.168.254.71: ICMP echo reply, id 52792, seq 1, length 64
10:42:02.521399 IP 192.168.254.1 > 192.168.254.71: ICMP echo reply, id 52792, seq 2, length 64
10:42:02.521612 IP 192.168.254.1 > 192.168.254.71: ICMP echo reply, id 52792, seq 3, length 64
10:42:02.521941 IP 192.168.254.1 > 192.168.254.71: ICMP echo reply, id 52792, seq 4, length 64
10:42:02.522419 IP 192.168.254.1 > 192.168.254.71: ICMP echo reply, id 52792, seq 5, length 64
10:42:02.523085 IP 192.168.254.1 > 192.168.254.71: ICMP echo reply, id 52792, seq 6, length 64

You can see the first ARP who-has went out very quickly after UP, but
was never responded to. Nearly a second later the kernel retries and
gets a response. Only then do the ping packets go out. If an ARP entry
is manually added prior to UP (after the cache is cleared) it is seen
that the first ping is never responded to, so its not only an issue with
ARP but with data packets in general.

As mentioned prior, the wireless interface was also monitored to verify
the ping/ARP packet made it OTA which was observed to be true.

Signed-off-by: James Prestwood <prestwoj@gmail.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/inetdevice.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index a038feb63f23..518b484a7f07 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -133,6 +133,8 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev)
 #define IN_DEV_ARP_ANNOUNCE(in_dev)	IN_DEV_MAXCONF((in_dev), ARP_ANNOUNCE)
 #define IN_DEV_ARP_IGNORE(in_dev)	IN_DEV_MAXCONF((in_dev), ARP_IGNORE)
 #define IN_DEV_ARP_NOTIFY(in_dev)	IN_DEV_MAXCONF((in_dev), ARP_NOTIFY)
+#define IN_DEV_ARP_EVICT_NOCARRIER(in_dev) IN_DEV_ANDCONF((in_dev), \
+							  ARP_EVICT_NOCARRIER)
 
 struct in_ifaddr {
 	struct hlist_node	hash;
-- 
cgit v1.2.3


From 18ac597af25e9760b76471524096f5b29eb820e6 Mon Sep 17 00:00:00 2001
From: James Prestwood <prestwoj@gmail.com>
Date: Mon, 1 Nov 2021 10:36:29 -0700
Subject: net: ndisc: introduce ndisc_evict_nocarrier sysctl parameter

In most situations the neighbor discovery cache should be cleared on a
NOCARRIER event which is currently done unconditionally. But for wireless
roams the neighbor discovery cache can and should remain intact since
the underlying network has not changed.

This patch introduces a sysctl option ndisc_evict_nocarrier which can
be disabled by a wireless supplicant during a roam. This allows packets
to be sent after a roam immediately without having to wait for
neighbor discovery.

A user reported roughly a 1 second delay after a roam before packets
could be sent out (note, on IPv4). This delay was due to the ARP
cache being cleared. During testing of this same scenario using IPv6
no delay was noticed, but regardless there is no reason to clear
the ndisc cache for wireless roams.

Signed-off-by: James Prestwood <prestwoj@gmail.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/ipv6.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index c383630d3f06..20c1f968da7c 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -79,6 +79,7 @@ struct ipv6_devconf {
 	__u32		ioam6_id;
 	__u32		ioam6_id_wide;
 	__u8		ioam6_enabled;
+	__u8		ndisc_evict_nocarrier;
 
 	struct ctl_table_header *sysctl_header;
 };
-- 
cgit v1.2.3


From 4a08e3271c55f8b5d56906a8aa5bd041911cf897 Mon Sep 17 00:00:00 2001
From: "Hector.Yuan" <hector.yuan@mediatek.com>
Date: Mon, 4 Oct 2021 22:42:33 +0800
Subject: cpufreq: Fix parameter in parse_perf_domain()

Pass cpu to parse_perf_domain() instead of pcpu.

Fixes: 8486a32dd484 ("cpufreq: Add of_perf_domain_get_sharing_cpumask")
Signed-off-by: Hector.Yuan <hector.yuan@mediatek.com>
[ Viresh: Massaged changelog ]
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 include/linux/cpufreq.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index ff88bb3e44fc..66a1f495f01a 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -1041,7 +1041,7 @@ static inline int of_perf_domain_get_sharing_cpumask(int pcpu, const char *list_
 		if (cpu == pcpu)
 			continue;
 
-		ret = parse_perf_domain(pcpu, list_name, cell_name);
+		ret = parse_perf_domain(cpu, list_name, cell_name);
 		if (ret < 0)
 			continue;
 
-- 
cgit v1.2.3


From 84882cf72cd774cf16fd338bdbf00f69ac9f9194 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 1 Nov 2021 22:26:08 -0700
Subject: Revert "net: avoid double accounting for pure zerocopy skbs"

This reverts commit f1a456f8f3fc5828d8abcad941860380ae147b1d.

  WARNING: CPU: 1 PID: 6819 at net/core/skbuff.c:5429 skb_try_coalesce+0x78b/0x7e0
  CPU: 1 PID: 6819 Comm: xxxxxxx Kdump: loaded Tainted: G S                5.15.0-04194-gd852503f7711 #16
  RIP: 0010:skb_try_coalesce+0x78b/0x7e0
  Code: e8 2a bf 41 ff 44 8b b3 bc 00 00 00 48 8b 7c 24 30 e8 19 c0 41 ff 44 89 f0 48 03 83 c0 00 00 00 48 89 44 24 40 e9 47 fb ff ff <0f> 0b e9 ca fc ff ff 4c 8d 70 ff 48 83 c0 07 48 89 44 24 38 e9 61
  RSP: 0018:ffff88881f449688 EFLAGS: 00010282
  RAX: 00000000fffffe96 RBX: ffff8881566e4460 RCX: ffffffff82079f7e
  RDX: 0000000000000003 RSI: dffffc0000000000 RDI: ffff8881566e47b0
  RBP: ffff8881566e46e0 R08: ffffed102619235d R09: ffffed102619235d
  R10: ffff888130c91ae3 R11: ffffed102619235c R12: ffff88881f4498a0
  R13: 0000000000000056 R14: 0000000000000009 R15: ffff888130c91ac0
  FS:  00007fec2cbb9700(0000) GS:ffff88881f440000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 00007fec1b060d80 CR3: 00000003acf94005 CR4: 00000000003706e0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  Call Trace:
   <IRQ>
   tcp_try_coalesce+0xeb/0x290
   ? tcp_parse_options+0x610/0x610
   ? mark_held_locks+0x79/0xa0
   tcp_queue_rcv+0x69/0x2f0
   tcp_rcv_established+0xa49/0xd40
   ? tcp_data_queue+0x18a0/0x18a0
   tcp_v6_do_rcv+0x1c9/0x880
   ? rt6_mtu_change_route+0x100/0x100
   tcp_v6_rcv+0x1624/0x1830

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 19 +------------------
 1 file changed, 1 insertion(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 10869906cc57..0bd6520329f6 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -454,15 +454,9 @@ enum {
 	 * all frags to avoid possible bad checksum
 	 */
 	SKBFL_SHARED_FRAG = BIT(1),
-
-	/* segment contains only zerocopy data and should not be
-	 * charged to the kernel memory.
-	 */
-	SKBFL_PURE_ZEROCOPY = BIT(2),
 };
 
 #define SKBFL_ZEROCOPY_FRAG	(SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG)
-#define SKBFL_ALL_ZEROCOPY	(SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY)
 
 /*
  * The callback notifies userspace to release buffers when skb DMA is done in
@@ -1470,17 +1464,6 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb)
 	return is_zcopy ? skb_uarg(skb) : NULL;
 }
 
-static inline bool skb_zcopy_pure(const struct sk_buff *skb)
-{
-	return skb_shinfo(skb)->flags & SKBFL_PURE_ZEROCOPY;
-}
-
-static inline bool skb_pure_zcopy_same(const struct sk_buff *skb1,
-				       const struct sk_buff *skb2)
-{
-	return skb_zcopy_pure(skb1) == skb_zcopy_pure(skb2);
-}
-
 static inline void net_zcopy_get(struct ubuf_info *uarg)
 {
 	refcount_inc(&uarg->refcnt);
@@ -1545,7 +1528,7 @@ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success)
 		if (!skb_zcopy_is_nouarg(skb))
 			uarg->callback(skb, uarg, zerocopy_success);
 
-		skb_shinfo(skb)->flags &= ~SKBFL_ALL_ZEROCOPY;
+		skb_shinfo(skb)->flags &= ~SKBFL_ZEROCOPY_FRAG;
 	}
 }
 
-- 
cgit v1.2.3


From ca7752caeaa70bd31d1714af566c9809688544af Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Mon, 1 Nov 2021 17:06:15 -0400
Subject: posix-cpu-timers: Clear task::posix_cputimers_work in copy_process()

copy_process currently copies task_struct.posix_cputimers_work as-is. If a
timer interrupt arrives while handling clone and before dup_task_struct
completes then the child task will have:

1. posix_cputimers_work.scheduled = true
2. posix_cputimers_work.work queued.

copy_process clears task_struct.task_works, so (2) will have no effect and
posix_cpu_timers_work will never run (not to mention it doesn't make sense
for two tasks to share a common linked list).

Since posix_cpu_timers_work never runs, posix_cputimers_work.scheduled is
never cleared. Since scheduled is set, future timer interrupts will skip
scheduling work, with the ultimate result that the task will never receive
timer expirations.

Together, the complete flow is:

1. Task 1 calls clone(), enters kernel.
2. Timer interrupt fires, schedules task work on Task 1.
   2a. task_struct.posix_cputimers_work.scheduled = true
   2b. task_struct.posix_cputimers_work.work added to
       task_struct.task_works.
3. dup_task_struct() copies Task 1 to Task 2.
4. copy_process() clears task_struct.task_works for Task 2.
5. Future timer interrupts on Task 2 see
   task_struct.posix_cputimers_work.scheduled = true and skip scheduling
   work.

Fix this by explicitly clearing contents of task_struct.posix_cputimers_work
in copy_process(). This was never meant to be shared or inherited across
tasks in the first place.

Fixes: 1fb497dd0030 ("posix-cpu-timers: Provide mechanisms to defer timer handling to task_work")
Reported-by: Rhys Hiltner <rhys@justin.tv>
Signed-off-by: Michael Pratt <mpratt@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: <stable@vger.kernel.org>
Link: https://lore.kernel.org/r/20211101210615.716522-1-mpratt@google.com
---
 include/linux/posix-timers.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 00fef0064355..5bbcd280bfd2 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -184,8 +184,10 @@ static inline void posix_cputimers_group_init(struct posix_cputimers *pct,
 #endif
 
 #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
+void clear_posix_cputimers_work(struct task_struct *p);
 void posix_cputimers_init_work(void);
 #else
+static inline void clear_posix_cputimers_work(struct task_struct *p) { }
 static inline void posix_cputimers_init_work(void) { }
 #endif
 
-- 
cgit v1.2.3


From 8791545eda52e8f3bc48e3cd902e38bf4ba4c9de Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 22 Oct 2021 16:17:03 -0400
Subject: NFS: Move NFS protocol display macros to global header

Refactor: surface useful show_ macros so they can be shared between
the client and server trace code.

Additional clean up:
- Housekeeping: ensure the correct #include files are pulled in
  and add proper TRACE_DEFINE_ENUM where they are missing
- Use a consistent naming scheme for the helpers
- Store values to be displayed symbolically as unsigned long, as
  that is the type that the __print_yada() functions take

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/nfs4.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 15004c469807..5662d8be04eb 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -292,6 +292,10 @@ enum nfsstat4 {
 	NFS4ERR_XATTR2BIG      = 10096,
 };
 
+/* error codes for internal client use */
+#define NFS4ERR_RESET_TO_MDS   12001
+#define NFS4ERR_RESET_TO_PNFS  12002
+
 static inline bool seqid_mutating_err(u32 err)
 {
 	/* See RFC 7530, section 9.1.7 */
-- 
cgit v1.2.3


From c4777efa751d293e369aec464ce6875e957be255 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 1 Nov 2021 17:45:55 -0700
Subject: net: add and use skb_unclone_keeptruesize() helper

While commit 097b9146c0e2 ("net: fix up truesize of cloned
skb in skb_prepare_for_shift()") fixed immediate issues found
when KFENCE was enabled/tested, there are still similar issues,
when tcp_trim_head() hits KFENCE while the master skb
is cloned.

This happens under heavy networking TX workloads,
when the TX completion might be delayed after incoming ACK.

This patch fixes the WARNING in sk_stream_kill_queues
when sk->sk_mem_queued/sk->sk_forward_alloc are not zero.

Fixes: d3fb45f370d9 ("mm, kfence: insert KFENCE hooks for SLAB")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Marco Elver <elver@google.com>
Link: https://lore.kernel.org/r/20211102004555.1359210-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 0bd6520329f6..a63e13082397 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1675,6 +1675,22 @@ static inline int skb_unclone(struct sk_buff *skb, gfp_t pri)
 	return 0;
 }
 
+/* This variant of skb_unclone() makes sure skb->truesize is not changed */
+static inline int skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri)
+{
+	might_sleep_if(gfpflags_allow_blocking(pri));
+
+	if (skb_cloned(skb)) {
+		unsigned int save = skb->truesize;
+		int res;
+
+		res = pskb_expand_head(skb, 0, 0, pri);
+		skb->truesize = save;
+		return res;
+	}
+	return 0;
+}
+
 /**
  *	skb_header_cloned - is the header a clone
  *	@skb: buffer to check
-- 
cgit v1.2.3


From 0dc54bd4d6e03be1f0b678c4297170b79f1a44ab Mon Sep 17 00:00:00 2001
From: Dominique Martinet <asmadeus@codewreck.org>
Date: Wed, 3 Nov 2021 17:34:05 +0900
Subject: fscache_cookie_enabled: check cookie is valid before accessing it

fscache_cookie_enabled() could be called on NULL cookies and cause a
null pointer dereference when accessing cookie flags: just make sure
the cookie is valid first

Suggested-by: David Howells <dhowells@redhat.com>
Acked-by: David Howells <dhowells@redhat.com>
Signed-off-by: Dominique Martinet <asmadeus@codewreck.org>
---
 include/linux/fscache.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index a4dab5998613..3b2282c157f7 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -167,7 +167,7 @@ struct fscache_cookie {
 
 static inline bool fscache_cookie_enabled(struct fscache_cookie *cookie)
 {
-	return test_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags);
+	return fscache_cookie_valid(cookie) && test_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags);
 }
 
 /*
-- 
cgit v1.2.3


From c081d53f97a1a90a38e4296dd3d6fda5e38dca2c Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Tue, 2 Nov 2021 08:02:47 -0400
Subject: security: pass asoc to sctp_assoc_request and sctp_sk_clone

This patch is to move secid and peer_secid from endpoint to association,
and pass asoc to sctp_assoc_request and sctp_sk_clone instead of ep. As
ep is the local endpoint and asoc represents a connection, and in SCTP
one sk/ep could have multiple asoc/connection, saving secid/peer_secid
for new asoc will overwrite the old asoc's.

Note that since asoc can be passed as NULL, security_sctp_assoc_request()
is moved to the place right after the new_asoc is created in
sctp_sf_do_5_1B_init() and sctp_sf_do_unexpected_init().

v1->v2:
  - fix the description of selinux_netlbl_skbuff_setsid(), as Jakub noticed.
  - fix the annotation in selinux_sctp_assoc_request(), as Richard Noticed.

Fixes: 72e89f50084c ("security: Add support for SCTP security hooks")
Reported-by: Prashanth Prahlad <pprahlad@redhat.com>
Reviewed-by: Richard Haines <richard_c_haines@btinternet.com>
Tested-by: Richard Haines <richard_c_haines@btinternet.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/lsm_hook_defs.h |  4 ++--
 include/linux/lsm_hooks.h     |  8 ++++----
 include/linux/security.h      | 10 +++++-----
 3 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index a9ac70ae01ab..df8de62f4710 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -329,11 +329,11 @@ LSM_HOOK(int, 0, tun_dev_create, void)
 LSM_HOOK(int, 0, tun_dev_attach_queue, void *security)
 LSM_HOOK(int, 0, tun_dev_attach, struct sock *sk, void *security)
 LSM_HOOK(int, 0, tun_dev_open, void *security)
-LSM_HOOK(int, 0, sctp_assoc_request, struct sctp_endpoint *ep,
+LSM_HOOK(int, 0, sctp_assoc_request, struct sctp_association *asoc,
 	 struct sk_buff *skb)
 LSM_HOOK(int, 0, sctp_bind_connect, struct sock *sk, int optname,
 	 struct sockaddr *address, int addrlen)
-LSM_HOOK(void, LSM_RET_VOID, sctp_sk_clone, struct sctp_endpoint *ep,
+LSM_HOOK(void, LSM_RET_VOID, sctp_sk_clone, struct sctp_association *asoc,
 	 struct sock *sk, struct sock *newsk)
 #endif /* CONFIG_SECURITY_NETWORK */
 
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 0bada4df23fc..d45b6f6e27fd 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1027,9 +1027,9 @@
  * Security hooks for SCTP
  *
  * @sctp_assoc_request:
- *	Passes the @ep and @chunk->skb of the association INIT packet to
+ *	Passes the @asoc and @chunk->skb of the association INIT packet to
  *	the security module.
- *	@ep pointer to sctp endpoint structure.
+ *	@asoc pointer to sctp association structure.
  *	@skb pointer to skbuff of association packet.
  *	Return 0 on success, error on failure.
  * @sctp_bind_connect:
@@ -1047,9 +1047,9 @@
  *	Called whenever a new socket is created by accept(2) (i.e. a TCP
  *	style socket) or when a socket is 'peeled off' e.g userspace
  *	calls sctp_peeloff(3).
- *	@ep pointer to current sctp endpoint structure.
+ *	@asoc pointer to current sctp association structure.
  *	@sk pointer to current sock structure.
- *	@sk pointer to new sock structure.
+ *	@newsk pointer to new sock structure.
  *
  * Security hooks for Infiniband
  *
diff --git a/include/linux/security.h b/include/linux/security.h
index 7e0ba63b5dde..bbf44a466832 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -179,7 +179,7 @@ struct xfrm_policy;
 struct xfrm_state;
 struct xfrm_user_sec_ctx;
 struct seq_file;
-struct sctp_endpoint;
+struct sctp_association;
 
 #ifdef CONFIG_MMU
 extern unsigned long mmap_min_addr;
@@ -1425,10 +1425,10 @@ int security_tun_dev_create(void);
 int security_tun_dev_attach_queue(void *security);
 int security_tun_dev_attach(struct sock *sk, void *security);
 int security_tun_dev_open(void *security);
-int security_sctp_assoc_request(struct sctp_endpoint *ep, struct sk_buff *skb);
+int security_sctp_assoc_request(struct sctp_association *asoc, struct sk_buff *skb);
 int security_sctp_bind_connect(struct sock *sk, int optname,
 			       struct sockaddr *address, int addrlen);
-void security_sctp_sk_clone(struct sctp_endpoint *ep, struct sock *sk,
+void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk,
 			    struct sock *newsk);
 
 #else	/* CONFIG_SECURITY_NETWORK */
@@ -1631,7 +1631,7 @@ static inline int security_tun_dev_open(void *security)
 	return 0;
 }
 
-static inline int security_sctp_assoc_request(struct sctp_endpoint *ep,
+static inline int security_sctp_assoc_request(struct sctp_association *asoc,
 					      struct sk_buff *skb)
 {
 	return 0;
@@ -1644,7 +1644,7 @@ static inline int security_sctp_bind_connect(struct sock *sk, int optname,
 	return 0;
 }
 
-static inline void security_sctp_sk_clone(struct sctp_endpoint *ep,
+static inline void security_sctp_sk_clone(struct sctp_association *asoc,
 					  struct sock *sk,
 					  struct sock *newsk)
 {
-- 
cgit v1.2.3


From 7c2ef0240e6abfd3cc59511339517358350a8910 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Tue, 2 Nov 2021 08:02:49 -0400
Subject: security: add sctp_assoc_established hook

security_sctp_assoc_established() is added to replace
security_inet_conn_established() called in
sctp_sf_do_5_1E_ca(), so that asoc can be accessed in security
subsystem and save the peer secid to asoc->peer_secid.

v1->v2:
  - fix the return value of security_sctp_assoc_established() in
    security.h, found by kernel test robot and Ondrej.

Fixes: 72e89f50084c ("security: Add support for SCTP security hooks")
Reported-by: Prashanth Prahlad <pprahlad@redhat.com>
Reviewed-by: Richard Haines <richard_c_haines@btinternet.com>
Tested-by: Richard Haines <richard_c_haines@btinternet.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/lsm_hook_defs.h | 2 ++
 include/linux/lsm_hooks.h     | 5 +++++
 include/linux/security.h      | 7 +++++++
 3 files changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index df8de62f4710..442a611fa0fb 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -335,6 +335,8 @@ LSM_HOOK(int, 0, sctp_bind_connect, struct sock *sk, int optname,
 	 struct sockaddr *address, int addrlen)
 LSM_HOOK(void, LSM_RET_VOID, sctp_sk_clone, struct sctp_association *asoc,
 	 struct sock *sk, struct sock *newsk)
+LSM_HOOK(void, LSM_RET_VOID, sctp_assoc_established, struct sctp_association *asoc,
+	 struct sk_buff *skb)
 #endif /* CONFIG_SECURITY_NETWORK */
 
 #ifdef CONFIG_SECURITY_INFINIBAND
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index d45b6f6e27fd..d6823214d5c1 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1050,6 +1050,11 @@
  *	@asoc pointer to current sctp association structure.
  *	@sk pointer to current sock structure.
  *	@newsk pointer to new sock structure.
+ * @sctp_assoc_established:
+ *	Passes the @asoc and @chunk->skb of the association COOKIE_ACK packet
+ *	to the security module.
+ *	@asoc pointer to sctp association structure.
+ *	@skb pointer to skbuff of association packet.
  *
  * Security hooks for Infiniband
  *
diff --git a/include/linux/security.h b/include/linux/security.h
index bbf44a466832..06eac4e61a13 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1430,6 +1430,8 @@ int security_sctp_bind_connect(struct sock *sk, int optname,
 			       struct sockaddr *address, int addrlen);
 void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk,
 			    struct sock *newsk);
+void security_sctp_assoc_established(struct sctp_association *asoc,
+				     struct sk_buff *skb);
 
 #else	/* CONFIG_SECURITY_NETWORK */
 static inline int security_unix_stream_connect(struct sock *sock,
@@ -1649,6 +1651,11 @@ static inline void security_sctp_sk_clone(struct sctp_association *asoc,
 					  struct sock *newsk)
 {
 }
+
+static inline void security_sctp_assoc_established(struct sctp_association *asoc,
+						   struct sk_buff *skb)
+{
+}
 #endif	/* CONFIG_SECURITY_NETWORK */
 
 #ifdef CONFIG_SECURITY_INFINIBAND
-- 
cgit v1.2.3


From 9b65b17db72313b7a4fe9bc9502928c88be57986 Mon Sep 17 00:00:00 2001
From: Talal Ahmad <talalahmad@google.com>
Date: Tue, 2 Nov 2021 22:58:44 -0400
Subject: net: avoid double accounting for pure zerocopy skbs

Track skbs containing only zerocopy data and avoid charging them to
kernel memory to correctly account the memory utilization for
msg_zerocopy. All of the data in such skbs is held in user pages which
are already accounted to user. Before this change, they are charged
again in kernel in __zerocopy_sg_from_iter. The charging in kernel is
excessive because data is not being copied into skb frags. This
excessive charging can lead to kernel going into memory pressure
state which impacts all sockets in the system adversely. Mark pure
zerocopy skbs with a SKBFL_PURE_ZEROCOPY flag and remove
charge/uncharge for data in such skbs.

Initially, an skb is marked pure zerocopy when it is empty and in
zerocopy path. skb can then change from a pure zerocopy skb to mixed
data skb (zerocopy and copy data) if it is at tail of write queue and
there is room available in it and non-zerocopy data is being sent in
the next sendmsg call. At this time sk_mem_charge is done for the pure
zerocopied data and the pure zerocopy flag is unmarked. We found that
this happens very rarely on workloads that pass MSG_ZEROCOPY.

A pure zerocopy skb can later be coalesced into normal skb if they are
next to each other in queue but this patch prevents coalescing from
happening. This avoids complexity of charging when skb downgrades from
pure zerocopy to mixed. This is also rare.

In sk_wmem_free_skb, if it is a pure zerocopy skb, an sk_mem_uncharge
for SKB_TRUESIZE(skb_end_offset(skb)) is done for sk_mem_charge in
tcp_skb_entail for an skb without data.

Testing with the msg_zerocopy.c benchmark between two hosts(100G nics)
with zerocopy showed that before this patch the 'sock' variable in
memory.stat for cgroup2 that tracks sum of sk_forward_alloc,
sk_rmem_alloc and sk_wmem_queued is around 1822720 and with this
change it is 0. This is due to no charge to sk_forward_alloc for
zerocopy data and shows memory utilization for kernel is lowered.

With this commit we don't see the warning we saw in previous commit
which resulted in commit 84882cf72cd774cf16fd338bdbf00f69ac9f9194.

Signed-off-by: Talal Ahmad <talalahmad@google.com>
Acked-by: Arjun Roy <arjunroy@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a63e13082397..686a666d073d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -454,9 +454,15 @@ enum {
 	 * all frags to avoid possible bad checksum
 	 */
 	SKBFL_SHARED_FRAG = BIT(1),
+
+	/* segment contains only zerocopy data and should not be
+	 * charged to the kernel memory.
+	 */
+	SKBFL_PURE_ZEROCOPY = BIT(2),
 };
 
 #define SKBFL_ZEROCOPY_FRAG	(SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG)
+#define SKBFL_ALL_ZEROCOPY	(SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY)
 
 /*
  * The callback notifies userspace to release buffers when skb DMA is done in
@@ -1464,6 +1470,17 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb)
 	return is_zcopy ? skb_uarg(skb) : NULL;
 }
 
+static inline bool skb_zcopy_pure(const struct sk_buff *skb)
+{
+	return skb_shinfo(skb)->flags & SKBFL_PURE_ZEROCOPY;
+}
+
+static inline bool skb_pure_zcopy_same(const struct sk_buff *skb1,
+				       const struct sk_buff *skb2)
+{
+	return skb_zcopy_pure(skb1) == skb_zcopy_pure(skb2);
+}
+
 static inline void net_zcopy_get(struct ubuf_info *uarg)
 {
 	refcount_inc(&uarg->refcnt);
@@ -1528,7 +1545,7 @@ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success)
 		if (!skb_zcopy_is_nouarg(skb))
 			uarg->callback(skb, uarg, zerocopy_success);
 
-		skb_shinfo(skb)->flags &= ~SKBFL_ZEROCOPY_FRAG;
+		skb_shinfo(skb)->flags &= ~SKBFL_ALL_ZEROCOPY;
 	}
 }
 
-- 
cgit v1.2.3


From 1aabe578dd86e9f2867c4db4fba9a15f4ba1825d Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 2 Nov 2021 15:02:36 -0700
Subject: ethtool: fix ethtool msg len calculation for pause stats

ETHTOOL_A_PAUSE_STAT_MAX is the MAX attribute id,
so we need to subtract non-stats and add one to
get a count (IOW -2+1 == -1).

Otherwise we'll see:

  ethnl cmd 21: calculated reply length 40, but consumed 52

Fixes: 9a27a33027f2 ("ethtool: add standard pause stats")
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool_netlink.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ethtool_netlink.h b/include/linux/ethtool_netlink.h
index 1e7bf78cb382..aba348d58ff6 100644
--- a/include/linux/ethtool_netlink.h
+++ b/include/linux/ethtool_netlink.h
@@ -10,6 +10,9 @@
 #define __ETHTOOL_LINK_MODE_MASK_NWORDS \
 	DIV_ROUND_UP(__ETHTOOL_LINK_MODE_MASK_NBITS, 32)
 
+#define ETHTOOL_PAUSE_STAT_CNT	(__ETHTOOL_A_PAUSE_STAT_CNT -		\
+				 ETHTOOL_A_PAUSE_STAT_TX_FRAMES)
+
 enum ethtool_multicast_groups {
 	ETHNL_MCGRP_MONITOR,
 };
-- 
cgit v1.2.3


From 4330fe35b8213e92ff51907b4cb6323be943a9ad Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Tue, 2 Nov 2021 16:01:56 -0600
Subject: nfs: remove unused header <linux/pnfs_osd_xdr.h>

Commit 19fcae3d4f2dd ("scsi: remove the SCSI OSD library") deleted the last
file that included <linux/pnfs_osd_xdr.h> but left that file behind.  It's
unused, get rid of it now.

Cc: Christoph Hellwig <hch@lst.de>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Anna Schumaker <anna.schumaker@netapp.com>
Cc: linux-nfs@vger.kernel.org
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/pnfs_osd_xdr.h | 317 -------------------------------------------
 1 file changed, 317 deletions(-)
 delete mode 100644 include/linux/pnfs_osd_xdr.h

(limited to 'include/linux')

diff --git a/include/linux/pnfs_osd_xdr.h b/include/linux/pnfs_osd_xdr.h
deleted file mode 100644
index 17d7d0d20eca..000000000000
--- a/include/linux/pnfs_osd_xdr.h
+++ /dev/null
@@ -1,317 +0,0 @@
-/*
- *  pNFS-osd on-the-wire data structures
- *
- *  Copyright (C) 2007 Panasas Inc. [year of first publication]
- *  All rights reserved.
- *
- *  Benny Halevy <bhalevy@panasas.com>
- *  Boaz Harrosh <ooo@electrozaur.com>
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License version 2
- *  See the file COPYING included with this distribution for more details.
- *
- *  Redistribution and use in source and binary forms, with or without
- *  modification, are permitted provided that the following conditions
- *  are met:
- *
- *  1. Redistributions of source code must retain the above copyright
- *     notice, this list of conditions and the following disclaimer.
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *  3. Neither the name of the Panasas company nor the names of its
- *     contributors may be used to endorse or promote products derived
- *     from this software without specific prior written permission.
- *
- *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
- *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-#ifndef __PNFS_OSD_XDR_H__
-#define __PNFS_OSD_XDR_H__
-
-#include <linux/nfs_fs.h>
-
-/*
- * draft-ietf-nfsv4-minorversion-22
- * draft-ietf-nfsv4-pnfs-obj-12
- */
-
-/* Layout Structure */
-
-enum pnfs_osd_raid_algorithm4 {
-	PNFS_OSD_RAID_0		= 1,
-	PNFS_OSD_RAID_4		= 2,
-	PNFS_OSD_RAID_5		= 3,
-	PNFS_OSD_RAID_PQ	= 4     /* Reed-Solomon P+Q */
-};
-
-/*   struct pnfs_osd_data_map4 {
- *       uint32_t                    odm_num_comps;
- *       length4                     odm_stripe_unit;
- *       uint32_t                    odm_group_width;
- *       uint32_t                    odm_group_depth;
- *       uint32_t                    odm_mirror_cnt;
- *       pnfs_osd_raid_algorithm4    odm_raid_algorithm;
- *   };
- */
-struct pnfs_osd_data_map {
-	u32	odm_num_comps;
-	u64	odm_stripe_unit;
-	u32	odm_group_width;
-	u32	odm_group_depth;
-	u32	odm_mirror_cnt;
-	u32	odm_raid_algorithm;
-};
-
-/*   struct pnfs_osd_objid4 {
- *       deviceid4       oid_device_id;
- *       uint64_t        oid_partition_id;
- *       uint64_t        oid_object_id;
- *   };
- */
-struct pnfs_osd_objid {
-	struct nfs4_deviceid	oid_device_id;
-	u64			oid_partition_id;
-	u64			oid_object_id;
-};
-
-/* For printout. I use:
- * kprint("dev(%llx:%llx)", _DEVID_LO(pointer), _DEVID_HI(pointer));
- * BE style
- */
-#define _DEVID_LO(oid_device_id) \
-	(unsigned long long)be64_to_cpup((__be64 *)(oid_device_id)->data)
-
-#define _DEVID_HI(oid_device_id) \
-	(unsigned long long)be64_to_cpup(((__be64 *)(oid_device_id)->data) + 1)
-
-enum pnfs_osd_version {
-	PNFS_OSD_MISSING              = 0,
-	PNFS_OSD_VERSION_1            = 1,
-	PNFS_OSD_VERSION_2            = 2
-};
-
-struct pnfs_osd_opaque_cred {
-	u32 cred_len;
-	void *cred;
-};
-
-enum pnfs_osd_cap_key_sec {
-	PNFS_OSD_CAP_KEY_SEC_NONE     = 0,
-	PNFS_OSD_CAP_KEY_SEC_SSV      = 1,
-};
-
-/*   struct pnfs_osd_object_cred4 {
- *       pnfs_osd_objid4         oc_object_id;
- *       pnfs_osd_version4       oc_osd_version;
- *       pnfs_osd_cap_key_sec4   oc_cap_key_sec;
- *       opaque                  oc_capability_key<>;
- *       opaque                  oc_capability<>;
- *   };
- */
-struct pnfs_osd_object_cred {
-	struct pnfs_osd_objid		oc_object_id;
-	u32				oc_osd_version;
-	u32				oc_cap_key_sec;
-	struct pnfs_osd_opaque_cred	oc_cap_key;
-	struct pnfs_osd_opaque_cred	oc_cap;
-};
-
-/*   struct pnfs_osd_layout4 {
- *       pnfs_osd_data_map4      olo_map;
- *       uint32_t                olo_comps_index;
- *       pnfs_osd_object_cred4   olo_components<>;
- *   };
- */
-struct pnfs_osd_layout {
-	struct pnfs_osd_data_map	olo_map;
-	u32				olo_comps_index;
-	u32				olo_num_comps;
-	struct pnfs_osd_object_cred	*olo_comps;
-};
-
-/* Device Address */
-enum pnfs_osd_targetid_type {
-	OBJ_TARGET_ANON = 1,
-	OBJ_TARGET_SCSI_NAME = 2,
-	OBJ_TARGET_SCSI_DEVICE_ID = 3,
-};
-
-/*   union pnfs_osd_targetid4 switch (pnfs_osd_targetid_type4 oti_type) {
- *       case OBJ_TARGET_SCSI_NAME:
- *           string              oti_scsi_name<>;
- *
- *       case OBJ_TARGET_SCSI_DEVICE_ID:
- *           opaque              oti_scsi_device_id<>;
- *
- *       default:
- *           void;
- *   };
- *
- *   union pnfs_osd_targetaddr4 switch (bool ota_available) {
- *       case TRUE:
- *           netaddr4            ota_netaddr;
- *       case FALSE:
- *           void;
- *   };
- *
- *   struct pnfs_osd_deviceaddr4 {
- *       pnfs_osd_targetid4      oda_targetid;
- *       pnfs_osd_targetaddr4    oda_targetaddr;
- *       uint64_t                oda_lun;
- *       opaque                  oda_systemid<>;
- *       pnfs_osd_object_cred4   oda_root_obj_cred;
- *       opaque                  oda_osdname<>;
- *   };
- */
-struct pnfs_osd_targetid {
-	u32				oti_type;
-	struct nfs4_string		oti_scsi_device_id;
-};
-
-/*   struct netaddr4 {
- *       // see struct rpcb in RFC1833
- *       string r_netid<>;    // network id
- *       string r_addr<>;     // universal address
- *   };
- */
-struct pnfs_osd_net_addr {
-	struct nfs4_string	r_netid;
-	struct nfs4_string	r_addr;
-};
-
-struct pnfs_osd_targetaddr {
-	u32				ota_available;
-	struct pnfs_osd_net_addr	ota_netaddr;
-};
-
-struct pnfs_osd_deviceaddr {
-	struct pnfs_osd_targetid	oda_targetid;
-	struct pnfs_osd_targetaddr	oda_targetaddr;
-	u8				oda_lun[8];
-	struct nfs4_string		oda_systemid;
-	struct pnfs_osd_object_cred	oda_root_obj_cred;
-	struct nfs4_string		oda_osdname;
-};
-
-/* LAYOUTCOMMIT: layoutupdate */
-
-/*   union pnfs_osd_deltaspaceused4 switch (bool dsu_valid) {
- *       case TRUE:
- *           int64_t     dsu_delta;
- *       case FALSE:
- *           void;
- *   };
- *
- *   struct pnfs_osd_layoutupdate4 {
- *       pnfs_osd_deltaspaceused4    olu_delta_space_used;
- *       bool                        olu_ioerr_flag;
- *   };
- */
-struct pnfs_osd_layoutupdate {
-	u32	dsu_valid;
-	s64	dsu_delta;
-	u32	olu_ioerr_flag;
-};
-
-/* LAYOUTRETURN: I/O Rrror Report */
-
-enum pnfs_osd_errno {
-	PNFS_OSD_ERR_EIO		= 1,
-	PNFS_OSD_ERR_NOT_FOUND		= 2,
-	PNFS_OSD_ERR_NO_SPACE		= 3,
-	PNFS_OSD_ERR_BAD_CRED		= 4,
-	PNFS_OSD_ERR_NO_ACCESS		= 5,
-	PNFS_OSD_ERR_UNREACHABLE	= 6,
-	PNFS_OSD_ERR_RESOURCE		= 7
-};
-
-/*   struct pnfs_osd_ioerr4 {
- *       pnfs_osd_objid4     oer_component;
- *       length4             oer_comp_offset;
- *       length4             oer_comp_length;
- *       bool                oer_iswrite;
- *       pnfs_osd_errno4     oer_errno;
- *   };
- */
-struct pnfs_osd_ioerr {
-	struct pnfs_osd_objid	oer_component;
-	u64			oer_comp_offset;
-	u64			oer_comp_length;
-	u32			oer_iswrite;
-	u32			oer_errno;
-};
-
-/* OSD XDR Client API */
-/* Layout helpers */
-/* Layout decoding is done in two parts:
- * 1. First Call pnfs_osd_xdr_decode_layout_map to read in only the header part
- *    of the layout. @iter members need not be initialized.
- *    Returned:
- *             @layout members are set. (@layout->olo_comps set to NULL).
- *
- *             Zero on success, or negative error if passed xdr is broken.
- *
- * 2. 2nd Call pnfs_osd_xdr_decode_layout_comp() in a loop until it returns
- *    false, to decode the next component.
- *    Returned:
- *       true if there is more to decode or false if we are done or error.
- *
- * Example:
- *	struct pnfs_osd_xdr_decode_layout_iter iter;
- *	struct pnfs_osd_layout layout;
- *	struct pnfs_osd_object_cred comp;
- *	int status;
- *
- *	status = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr);
- *	if (unlikely(status))
- *		goto err;
- *	while(pnfs_osd_xdr_decode_layout_comp(&comp, &iter, xdr, &status)) {
- *		// All of @comp strings point to inside the xdr_buffer
- *		// or scrach buffer. Copy them out to user memory eg.
- *		copy_single_comp(dest_comp++, &comp);
- *	}
- *	if (unlikely(status))
- *		goto err;
- */
-
-struct pnfs_osd_xdr_decode_layout_iter {
-	unsigned total_comps;
-	unsigned decoded_comps;
-};
-
-extern int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout,
-	struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr);
-
-extern bool pnfs_osd_xdr_decode_layout_comp(struct pnfs_osd_object_cred *comp,
-	struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr,
-	int *err);
-
-/* Device Info helpers */
-
-/* Note: All strings inside @deviceaddr point to space inside @p.
- * @p should stay valid while @deviceaddr is in use.
- */
-extern void pnfs_osd_xdr_decode_deviceaddr(
-	struct pnfs_osd_deviceaddr *deviceaddr, __be32 *p);
-
-/* layoutupdate (layout_commit) xdr helpers */
-extern int
-pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr,
-				 struct pnfs_osd_layoutupdate *lou);
-
-/* osd_ioerror encoding (layout_return) */
-extern __be32 *pnfs_osd_xdr_ioerr_reserve_space(struct xdr_stream *xdr);
-extern void pnfs_osd_xdr_encode_ioerr(__be32 *p, struct pnfs_osd_ioerr *ioerr);
-
-#endif /* __PNFS_OSD_XDR_H__ */
-- 
cgit v1.2.3


From 92f62485b3715882cd397b0cbd80a96d179b86d6 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 2 Nov 2021 21:31:22 +0200
Subject: net: dsa: felix: fix broken VLAN-tagged PTP under VLAN-aware bridge

Normally it is expected that the dsa_device_ops :: rcv() method finishes
parsing the DSA tag and consumes it, then never looks at it again.

But commit c0bcf537667c ("net: dsa: ocelot: add hardware timestamping
support for Felix") added support for RX timestamping in a very
unconventional way. On this switch, a partial timestamp is available in
the DSA header, but the driver got away with not parsing that timestamp
right away, but instead delayed that parsing for a little longer:

dsa_switch_rcv():
	nskb = cpu_dp->rcv(skb, dev); <------------- not here
	-> ocelot_rcv()
	...

	skb = nskb;
	skb_push(skb, ETH_HLEN);
	skb->pkt_type = PACKET_HOST;
	skb->protocol = eth_type_trans(skb, skb->dev);

	...

	if (dsa_skb_defer_rx_timestamp(p, skb)) <--- but here
	-> felix_rxtstamp()
		return 0;

When in felix_rxtstamp(), this driver accounted for the fact that
eth_type_trans() happened in the meanwhile, so it got a hold of the
extraction header again by subtracting (ETH_HLEN + OCELOT_TAG_LEN) bytes
from the current skb->data.

This worked for quite some time but was quite fragile from the very
beginning. Not to mention that having DSA tag parsing split in two
different files, under different folders (net/dsa/tag_ocelot.c vs
drivers/net/dsa/ocelot/felix.c) made it quite non-obvious for patches to
come that they might break this.

Finally, the blamed commit does the following: at the end of
ocelot_rcv(), it checks whether the skb payload contains a VLAN header.
If it does, and this port is under a VLAN-aware bridge, that VLAN ID
might not be correct in the sense that the packet might have suffered
VLAN rewriting due to TCAM rules (VCAP IS1). So we consume the VLAN ID
from the skb payload using __skb_vlan_pop(), and take the classified
VLAN ID from the DSA tag, and construct a hwaccel VLAN tag with the
classified VLAN, and the skb payload is VLAN-untagged.

The big problem is that __skb_vlan_pop() does:

	memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN);
	__skb_pull(skb, VLAN_HLEN);

aka it moves the Ethernet header 4 bytes to the right, and pulls 4 bytes
from the skb headroom (effectively also moving skb->data, by definition).
So for felix_rxtstamp()'s fragile logic, all bets are off now.
Instead of having the "extraction" pointer point to the DSA header,
it actually points to 4 bytes _inside_ the extraction header.
Corollary, the last 4 bytes of the "extraction" header are in fact 4
stale bytes of the destination MAC address from the Ethernet header,
from prior to the __skb_vlan_pop() movement.

So of course, RX timestamps are completely bogus when the system is
configured in this way.

The fix is actually very simple: just don't structure the code like that.
For better or worse, the DSA PTP timestamping API does not offer a
straightforward way for drivers to present their RX timestamps, but
other drivers (sja1105) have established a simple mechanism to carry
their RX timestamp from dsa_device_ops :: rcv() all the way to
dsa_switch_ops :: port_rxtstamp() and even later. That mechanism is to
simply save the partial timestamp to the skb->cb, and complete it later.

Question: why don't we simply populate the skb's struct
skb_shared_hwtstamps from ocelot_rcv(), and bother with this
complication of propagating the timestamp to felix_rxtstamp()?

Answer: dsa_switch_ops :: port_rxtstamp() answers the question whether
PTP packets need sleepable context to retrieve the full RX timestamp.
Currently felix_rxtstamp() answers "no, thanks" to that question, and
calls ocelot_ptp_gettime64() from softirq atomic context. This is
understandable, since Felix VSC9959 is a PCIe memory-mapped switch, so
hardware access does not require sleeping. But the felix driver is
preparing for the introduction of other switches where hardware access
is over a slow bus like SPI or MDIO:
https://lore.kernel.org/lkml/20210814025003.2449143-1-colin.foster@in-advantage.com/

So I would like to keep this code structure, so the rework needed when
that driver will need PTP support will be minimal (answer "yes, I need
deferred context for this skb's RX timestamp", then the partial
timestamp will still be found in the skb->cb.

Fixes: ea440cd2d9b2 ("net: dsa: tag_ocelot: use VLAN information from tagging header when available")
Reported-by: Po Liu <po.liu@nxp.com>
Cc: Yangbo Lu <yangbo.lu@nxp.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dsa/ocelot.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/dsa/ocelot.h b/include/linux/dsa/ocelot.h
index d42010cf5468..7ee708ad7df2 100644
--- a/include/linux/dsa/ocelot.h
+++ b/include/linux/dsa/ocelot.h
@@ -12,6 +12,7 @@
 struct ocelot_skb_cb {
 	struct sk_buff *clone;
 	unsigned int ptp_class; /* valid only for clones */
+	u32 tstamp_lo;
 	u8 ptp_cmd;
 	u8 ts_id;
 };
-- 
cgit v1.2.3


From 6429e46304ac7820eebbea2bf5d73b90c18e0e06 Mon Sep 17 00:00:00 2001
From: Lorenz Bauer <lmb@cloudflare.com>
Date: Thu, 28 Oct 2021 10:47:21 +0100
Subject: libfs: Move shmem_exchange to simple_rename_exchange

Move shmem_exchange and make it available to other callers.

Suggested-by: Miklos Szeredi <mszeredi@redhat.com>
Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Miklos Szeredi <mszeredi@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <christian.brauner@ubuntu.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/bpf/20211028094724.59043-2-lmb@cloudflare.com
---
 include/linux/fs.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index f3cfca5edc9a..bc4e97b82ddd 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3383,6 +3383,8 @@ extern int simple_open(struct inode *inode, struct file *file);
 extern int simple_link(struct dentry *, struct inode *, struct dentry *);
 extern int simple_unlink(struct inode *, struct dentry *);
 extern int simple_rmdir(struct inode *, struct dentry *);
+extern int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry,
+				  struct inode *new_dir, struct dentry *new_dentry);
 extern int simple_rename(struct user_namespace *, struct inode *,
 			 struct dentry *, struct inode *, struct dentry *,
 			 unsigned int);
-- 
cgit v1.2.3


From b18c1ad685d9a6af5189d4cc037a653270a07b4c Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Mon, 18 Oct 2021 15:17:25 +0300
Subject: i2c: Allow an ACPI driver to manage the device's power state during
 probe

Enable drivers to tell ACPI that there's no need to power on a device for
probe. Drivers should still perform this by themselves if there's a need
to. In some cases powering on the device during probe is undesirable, and
this change enables a driver to choose what fits best for it.

Add a field called "flags" into struct i2c_driver for driver flags, and a
flag I2C_DRV_ACPI_WAIVE_D0_PROBE to tell a driver supports probe in ACPI D
states other than 0.

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Tomasz Figa <tfiga@chromium.org>
Acked-by: Wolfram Sang <wsa@kernel.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/i2c.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 2ce3efbe9198..16119ac1aa97 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -11,6 +11,7 @@
 #define _LINUX_I2C_H
 
 #include <linux/acpi.h>		/* for acpi_handle */
+#include <linux/bits.h>
 #include <linux/mod_devicetable.h>
 #include <linux/device.h>	/* for struct device */
 #include <linux/sched.h>	/* for completion */
@@ -222,6 +223,15 @@ enum i2c_alert_protocol {
 	I2C_PROTOCOL_SMBUS_HOST_NOTIFY,
 };
 
+/**
+ * enum i2c_driver_flags - Flags for an I2C device driver
+ *
+ * @I2C_DRV_ACPI_WAIVE_D0_PROBE: Don't put the device in D0 state for probe
+ */
+enum i2c_driver_flags {
+	I2C_DRV_ACPI_WAIVE_D0_PROBE = BIT(0),
+};
+
 /**
  * struct i2c_driver - represent an I2C device driver
  * @class: What kind of i2c device we instantiate (for detect)
@@ -236,6 +246,7 @@ enum i2c_alert_protocol {
  * @detect: Callback for device detection
  * @address_list: The I2C addresses to probe (for detect)
  * @clients: List of detected clients we created (for i2c-core use only)
+ * @flags: A bitmask of flags defined in &enum i2c_driver_flags
  *
  * The driver.owner field should be set to the module owner of this driver.
  * The driver.name field should be set to the name of this driver.
@@ -294,6 +305,8 @@ struct i2c_driver {
 	int (*detect)(struct i2c_client *client, struct i2c_board_info *info);
 	const unsigned short *address_list;
 	struct list_head clients;
+
+	u32 flags;
 };
 #define to_i2c_driver(d) container_of(d, struct i2c_driver, driver)
 
@@ -1015,6 +1028,7 @@ u32 i2c_acpi_find_bus_speed(struct device *dev);
 struct i2c_client *i2c_acpi_new_device(struct device *dev, int index,
 				       struct i2c_board_info *info);
 struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle);
+bool i2c_acpi_waive_d0_probe(struct device *dev);
 #else
 static inline bool i2c_acpi_get_i2c_resource(struct acpi_resource *ares,
 					     struct acpi_resource_i2c_serialbus **i2c)
@@ -1038,6 +1052,10 @@ static inline struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle ha
 {
 	return NULL;
 }
+static inline bool i2c_acpi_waive_d0_probe(struct device *dev)
+{
+	return false;
+}
 #endif /* CONFIG_ACPI */
 
 #endif /* _LINUX_I2C_H */
-- 
cgit v1.2.3


From b82a7df4a7f3841896aaec1ad81e654bc87b5989 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Mon, 18 Oct 2021 15:17:27 +0300
Subject: ACPI: Add a convenience function to tell a device is in D0 state

Add a convenience function to tell whether a device is in D0 state,
primarily for use in drivers' probe or remove functions on busses where
the custom is to power on the device for the duration of both.

Returns false on non-ACPI systems.

Suggested-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Tomasz Figa <tfiga@chromium.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/acpi.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index fbc2146050a4..df70fc27afd7 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1016,6 +1016,7 @@ int acpi_subsys_runtime_suspend(struct device *dev);
 int acpi_subsys_runtime_resume(struct device *dev);
 int acpi_dev_pm_attach(struct device *dev, bool power_on);
 bool acpi_storage_d3(struct device *dev);
+bool acpi_dev_state_d0(struct device *dev);
 #else
 static inline int acpi_subsys_runtime_suspend(struct device *dev) { return 0; }
 static inline int acpi_subsys_runtime_resume(struct device *dev) { return 0; }
@@ -1027,6 +1028,10 @@ static inline bool acpi_storage_d3(struct device *dev)
 {
 	return false;
 }
+static inline bool acpi_dev_state_d0(struct device *dev)
+{
+	return true;
+}
 #endif
 
 #if defined(CONFIG_ACPI) && defined(CONFIG_PM_SLEEP)
-- 
cgit v1.2.3


From 5c4e0a21fae877a7ef89be6dcc6263ec672372b8 Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Tue, 2 Nov 2021 07:24:20 -0700
Subject: string: uninline memcpy_and_pad

When building m68k:allmodconfig, recent versions of gcc generate the
following error if the length of UTS_RELEASE is less than 8 bytes.

  In function 'memcpy_and_pad',
    inlined from 'nvmet_execute_disc_identify' at
      drivers/nvme/target/discovery.c:268:2: arch/m68k/include/asm/string.h:72:25: error:
	'__builtin_memcpy' reading 8 bytes from a region of size 7

Discussions around the problem suggest that this only happens if an
architecture does not provide strlen(), if -ffreestanding is provided as
compiler option, and if CONFIG_FORTIFY_SOURCE=n. All of this is the case
for m68k. The exact reasons are unknown, but seem to be related to the
ability of the compiler to evaluate the return value of strlen() and
the resulting execution flow in memcpy_and_pad(). It would be possible
to work around the problem by using sizeof(UTS_RELEASE) instead of
strlen(UTS_RELEASE), but that would only postpone the problem until the
function is called in a similar way. Uninline memcpy_and_pad() instead
to solve the problem for good.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Andy Shevchenko <andriy.shevchenko@intel.com>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/string.h | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/string.h b/include/linux/string.h
index 5a36608144a9..b6572aeca2f5 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -253,23 +253,8 @@ static inline const char *kbasename(const char *path)
 #include <linux/fortify-string.h>
 #endif
 
-/**
- * memcpy_and_pad - Copy one buffer to another with padding
- * @dest: Where to copy to
- * @dest_len: The destination buffer size
- * @src: Where to copy from
- * @count: The number of bytes to copy
- * @pad: Character to use for padding if space is left in destination.
- */
-static inline void memcpy_and_pad(void *dest, size_t dest_len,
-				  const void *src, size_t count, int pad)
-{
-	if (dest_len > count) {
-		memcpy(dest, src, count);
-		memset(dest + count, pad,  dest_len - count);
-	} else
-		memcpy(dest, src, dest_len);
-}
+void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count,
+		    int pad);
 
 /**
  * memset_after - Set a value after a struct member to the end of a struct
-- 
cgit v1.2.3


From 00b06da29cf9dc633cdba87acd3f57f4df3fd5c7 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 29 Oct 2021 09:14:19 -0500
Subject: signal: Add SA_IMMUTABLE to ensure forced siganls do not get changed

As Andy pointed out that there are races between
force_sig_info_to_task and sigaction[1] when force_sig_info_task.  As
Kees discovered[2] ptrace is also able to change these signals.

In the case of seeccomp killing a process with a signal it is a
security violation to allow the signal to be caught or manipulated.

Solve this problem by introducing a new flag SA_IMMUTABLE that
prevents sigaction and ptrace from modifying these forced signals.
This flag is carefully made kernel internal so that no new ABI is
introduced.

Longer term I think this can be solved by guaranteeing short circuit
delivery of signals in this case.  Unfortunately reliable and
guaranteed short circuit delivery of these signals is still a ways off
from being implemented, tested, and merged.  So I have implemented a much
simpler alternative for now.

[1] https://lkml.kernel.org/r/b5d52d25-7bde-4030-a7b1-7c6f8ab90660@www.fastmail.com
[2] https://lkml.kernel.org/r/202110281136.5CE65399A7@keescook
Cc: stable@vger.kernel.org
Fixes: 307d522f5eb8 ("signal/seccomp: Refactor seccomp signal and coredump generation")
Tested-by: Andrea Righi <andrea.righi@canonical.com>
Tested-by: Kees Cook <keescook@chromium.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/signal_types.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/signal_types.h b/include/linux/signal_types.h
index 34cb28b8f16c..a70b2bdbf4d9 100644
--- a/include/linux/signal_types.h
+++ b/include/linux/signal_types.h
@@ -70,6 +70,9 @@ struct ksignal {
 	int sig;
 };
 
+/* Used to kill the race between sigaction and forced signals */
+#define SA_IMMUTABLE		0x00800000
+
 #ifndef __ARCH_UAPI_SA_FLAGS
 #ifdef SA_RESTORER
 #define __ARCH_UAPI_SA_FLAGS	SA_RESTORER
-- 
cgit v1.2.3


From 2116274af46b12df7cea1dc5698f3cf2f231f8a9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 4 Nov 2021 18:20:37 +0100
Subject: block: add a loff_t cast to bdev_nr_bytes

Not really needed as both loff_t and sector_t are always 64-bits wide,
but this documents the different types a bit better.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20211104172037.531803-1-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/genhd.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 900325aa5d31..1dce769c4596 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -243,7 +243,7 @@ static inline sector_t bdev_nr_sectors(struct block_device *bdev)
 
 static inline loff_t bdev_nr_bytes(struct block_device *bdev)
 {
-	return bdev_nr_sectors(bdev) << SECTOR_SHIFT;
+	return (loff_t)bdev_nr_sectors(bdev) << SECTOR_SHIFT;
 }
 
 static inline sector_t get_capacity(struct gendisk *disk)
-- 
cgit v1.2.3


From 0ab8d0f6ae3f1234e38eaedc3ff7c22bfdf7f78c Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Wed, 29 Sep 2021 17:38:34 +0100
Subject: irqdomain: Make of_phandle_args_to_fwspec() generally available

of_phandle_args_to_fwspec() can be generally useful to code extracting a DT
of_phandle and using an irq_fwspec to use the hierarchical irqdomain API.

Make it visible to the rest of the kernel, including modules.

Link: https://lore.kernel.org/r/20210929163847.2807812-2-maz@kernel.org
Tested-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 include/linux/irqdomain.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 23e4ee523576..cfd442316f39 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -64,6 +64,10 @@ struct irq_fwspec {
 	u32 param[IRQ_DOMAIN_IRQ_SPEC_PARAMS];
 };
 
+/* Conversion function from of_phandle_args fields to fwspec  */
+void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
+			       unsigned int count, struct irq_fwspec *fwspec);
+
 /*
  * Should several domains have the same device node, but serve
  * different purposes (for example one domain is for PCI/MSI, and the
-- 
cgit v1.2.3


From 439b08c57c3fe1df85cfe9d00accdf9b62cb3275 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Wed, 3 Nov 2021 16:51:36 +0100
Subject: Revert "usb: core: hcd: Add support for deferring roothub
 registration"

This reverts commit 58877b0824da15698bd85a0a9dbfa8c354e6ecb7.

It has been reported to be causing problems in Arch and Fedora bug
reports.

Reported-by: Hans de Goede <hdegoede@redhat.com>
Link: https://bbs.archlinux.org/viewtopic.php?pid=2000956#p2000956
Link: https://bugzilla.redhat.com/show_bug.cgi?id=2019542
Link: https://bugzilla.redhat.com/show_bug.cgi?id=2019576
Link: https://lore.kernel.org/r/42bcbea6-5eb8-16c7-336a-2cb72e71bc36@redhat.com
Cc: Mathias Nyman <mathias.nyman@linux.intel.com>
Cc: Chris Chiu <chris.chiu@canonical.com>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Kishon Vijay Abraham I <kishon@ti.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/hcd.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h
index 2c1fc9212cf2..548a028f2dab 100644
--- a/include/linux/usb/hcd.h
+++ b/include/linux/usb/hcd.h
@@ -124,7 +124,6 @@ struct usb_hcd {
 #define HCD_FLAG_RH_RUNNING		5	/* root hub is running? */
 #define HCD_FLAG_DEAD			6	/* controller has died? */
 #define HCD_FLAG_INTF_AUTHORIZED	7	/* authorize interfaces? */
-#define HCD_FLAG_DEFER_RH_REGISTER	8	/* Defer roothub registration */
 
 	/* The flags can be tested using these macros; they are likely to
 	 * be slightly faster than test_bit().
@@ -135,7 +134,6 @@ struct usb_hcd {
 #define HCD_WAKEUP_PENDING(hcd)	((hcd)->flags & (1U << HCD_FLAG_WAKEUP_PENDING))
 #define HCD_RH_RUNNING(hcd)	((hcd)->flags & (1U << HCD_FLAG_RH_RUNNING))
 #define HCD_DEAD(hcd)		((hcd)->flags & (1U << HCD_FLAG_DEAD))
-#define HCD_DEFER_RH_REGISTER(hcd) ((hcd)->flags & (1U << HCD_FLAG_DEFER_RH_REGISTER))
 
 	/*
 	 * Specifies if interfaces are authorized by default
-- 
cgit v1.2.3


From 27d9a4d69433af1827a764fe235866d5d5501fdb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de>
Date: Thu, 9 Sep 2021 11:48:48 +0200
Subject: pwm: Add might_sleep() annotations for !CONFIG_PWM API functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The normal implementations of these functions make use of mutexes. To
make it obvious that these functions might sleep also add annotations to
the dummy implementations in the !CONFIG_PWM case.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 include/linux/pwm.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 725c9b784e60..515e33978e97 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -429,11 +429,13 @@ struct pwm_device *devm_fwnode_pwm_get(struct device *dev,
 #else
 static inline struct pwm_device *pwm_request(int pwm_id, const char *label)
 {
+	might_sleep();
 	return ERR_PTR(-ENODEV);
 }
 
 static inline void pwm_free(struct pwm_device *pwm)
 {
+	might_sleep();
 }
 
 static inline int pwm_apply_state(struct pwm_device *pwm,
@@ -493,12 +495,14 @@ static inline struct pwm_device *pwm_request_from_chip(struct pwm_chip *chip,
 						       unsigned int index,
 						       const char *label)
 {
+	might_sleep();
 	return ERR_PTR(-ENODEV);
 }
 
 static inline struct pwm_device *pwm_get(struct device *dev,
 					 const char *consumer)
 {
+	might_sleep();
 	return ERR_PTR(-ENODEV);
 }
 
@@ -506,16 +510,19 @@ static inline struct pwm_device *of_pwm_get(struct device *dev,
 					    struct device_node *np,
 					    const char *con_id)
 {
+	might_sleep();
 	return ERR_PTR(-ENODEV);
 }
 
 static inline void pwm_put(struct pwm_device *pwm)
 {
+	might_sleep();
 }
 
 static inline struct pwm_device *devm_pwm_get(struct device *dev,
 					      const char *consumer)
 {
+	might_sleep();
 	return ERR_PTR(-ENODEV);
 }
 
@@ -523,6 +530,7 @@ static inline struct pwm_device *devm_of_pwm_get(struct device *dev,
 						 struct device_node *np,
 						 const char *con_id)
 {
+	might_sleep();
 	return ERR_PTR(-ENODEV);
 }
 
@@ -530,6 +538,7 @@ static inline struct pwm_device *
 devm_fwnode_pwm_get(struct device *dev, struct fwnode_handle *fwnode,
 		    const char *con_id)
 {
+	might_sleep();
 	return ERR_PTR(-ENODEV);
 }
 #endif
-- 
cgit v1.2.3


From 4ad91a227817ae48f931595d1101fc7100073ce9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de>
Date: Thu, 9 Sep 2021 11:48:49 +0200
Subject: pwm: Make it explicit that pwm_apply_state() might sleep
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

At least some implementations sleep. So mark pwm_apply_state() with a
might_sleep() to make callers aware. In the worst case this uncovers a
valid atomic user, then we revert this patch and at least gained some more
knowledge and then can work on a concept similar to
gpio_get_value/gpio_get_value_cansleep.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 include/linux/pwm.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 515e33978e97..e6dac95e4960 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -441,6 +441,7 @@ static inline void pwm_free(struct pwm_device *pwm)
 static inline int pwm_apply_state(struct pwm_device *pwm,
 				  const struct pwm_state *state)
 {
+	might_sleep();
 	return -ENOTSUPP;
 }
 
@@ -452,6 +453,7 @@ static inline int pwm_adjust_config(struct pwm_device *pwm)
 static inline int pwm_config(struct pwm_device *pwm, int duty_ns,
 			     int period_ns)
 {
+	might_sleep();
 	return -EINVAL;
 }
 
@@ -464,11 +466,13 @@ static inline int pwm_capture(struct pwm_device *pwm,
 
 static inline int pwm_enable(struct pwm_device *pwm)
 {
+	might_sleep();
 	return -EINVAL;
 }
 
 static inline void pwm_disable(struct pwm_device *pwm)
 {
+	might_sleep();
 }
 
 static inline int pwm_set_chip_data(struct pwm_device *pwm, void *data)
-- 
cgit v1.2.3


From c9a20383578abd8f7fb8ba88f4c6d25b47924c34 Mon Sep 17 00:00:00 2001
From: Carlos de Paula <me@carlosedp.com>
Date: Mon, 30 Aug 2021 16:53:45 -0300
Subject: mfd: da9063: Add support for latest EA silicon revision

This update adds new regmap to support the latest EA silicon
which will be selected based on the chip and variant
information read from the device.

Signed-off-by: Carlos de Paula <me@carlosedp.com>
Reviewed-by: Adam Thomson <Adam.Thomson.Opensource@diasemi.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/da9063/core.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/da9063/core.h b/include/linux/mfd/da9063/core.h
index fa7a43f02f27..8db52324f416 100644
--- a/include/linux/mfd/da9063/core.h
+++ b/include/linux/mfd/da9063/core.h
@@ -36,6 +36,7 @@ enum da9063_variant_codes {
 	PMIC_DA9063_BB = 0x5,
 	PMIC_DA9063_CA = 0x6,
 	PMIC_DA9063_DA = 0x7,
+	PMIC_DA9063_EA = 0x8,
 };
 
 /* Interrupts */
-- 
cgit v1.2.3


From ec14d90dee8ec6960324ae9f1116103efcde8a52 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de>
Date: Tue, 12 Oct 2021 17:39:35 +0200
Subject: mfd: tps65912: Make tps65912_device_exit() return void
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Up to now tps65912_device_exit() returns zero unconditionally. Make it
return void instead which makes it easier to see in the callers that
there is no error to handle.

Also the return value of i2c and spi remove callbacks is ignored anyway.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20211012153945.2651412-11-u.kleine-koenig@pengutronix.de
---
 include/linux/mfd/tps65912.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/tps65912.h b/include/linux/mfd/tps65912.h
index 7943e413deae..8a61386cb8c1 100644
--- a/include/linux/mfd/tps65912.h
+++ b/include/linux/mfd/tps65912.h
@@ -322,6 +322,6 @@ struct tps65912 {
 extern const struct regmap_config tps65912_regmap_config;
 
 int tps65912_device_init(struct tps65912 *tps);
-int tps65912_device_exit(struct tps65912 *tps);
+void tps65912_device_exit(struct tps65912 *tps);
 
 #endif /*  __LINUX_MFD_TPS65912_H */
-- 
cgit v1.2.3


From 0cee0416563d7cac807c8f092941f3e37ede05db Mon Sep 17 00:00:00 2001
From: Luca Ceresoli <luca@lucaceresoli.net>
Date: Tue, 19 Oct 2021 16:59:11 +0200
Subject: mfd: max77686: Correct tab-based alignment of register addresses

Some lines have an extra tab, remove them for proper visual alignment as
present on the rest of this file.

Signed-off-by: Luca Ceresoli <luca@lucaceresoli.net>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20211019145919.7327-2-luca@lucaceresoli.net
---
 include/linux/mfd/max77686-private.h | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/max77686-private.h b/include/linux/mfd/max77686-private.h
index 833e578e051e..b1482b3cf353 100644
--- a/include/linux/mfd/max77686-private.h
+++ b/include/linux/mfd/max77686-private.h
@@ -133,35 +133,35 @@ enum max77686_pmic_reg {
 	/* Reserved: 0x7A-0x7D */
 
 	MAX77686_REG_BBAT_CHG		= 0x7E,
-	MAX77686_REG_32KHZ			= 0x7F,
+	MAX77686_REG_32KHZ		= 0x7F,
 
 	MAX77686_REG_PMIC_END		= 0x80,
 };
 
 enum max77686_rtc_reg {
-	MAX77686_RTC_INT			= 0x00,
-	MAX77686_RTC_INTM			= 0x01,
+	MAX77686_RTC_INT		= 0x00,
+	MAX77686_RTC_INTM		= 0x01,
 	MAX77686_RTC_CONTROLM		= 0x02,
 	MAX77686_RTC_CONTROL		= 0x03,
 	MAX77686_RTC_UPDATE0		= 0x04,
 	/* Reserved: 0x5 */
 	MAX77686_WTSR_SMPL_CNTL		= 0x06,
-	MAX77686_RTC_SEC			= 0x07,
-	MAX77686_RTC_MIN			= 0x08,
-	MAX77686_RTC_HOUR			= 0x09,
+	MAX77686_RTC_SEC		= 0x07,
+	MAX77686_RTC_MIN		= 0x08,
+	MAX77686_RTC_HOUR		= 0x09,
 	MAX77686_RTC_WEEKDAY		= 0x0A,
-	MAX77686_RTC_MONTH			= 0x0B,
-	MAX77686_RTC_YEAR			= 0x0C,
-	MAX77686_RTC_DATE			= 0x0D,
-	MAX77686_ALARM1_SEC			= 0x0E,
-	MAX77686_ALARM1_MIN			= 0x0F,
+	MAX77686_RTC_MONTH		= 0x0B,
+	MAX77686_RTC_YEAR		= 0x0C,
+	MAX77686_RTC_DATE		= 0x0D,
+	MAX77686_ALARM1_SEC		= 0x0E,
+	MAX77686_ALARM1_MIN		= 0x0F,
 	MAX77686_ALARM1_HOUR		= 0x10,
 	MAX77686_ALARM1_WEEKDAY		= 0x11,
 	MAX77686_ALARM1_MONTH		= 0x12,
 	MAX77686_ALARM1_YEAR		= 0x13,
 	MAX77686_ALARM1_DATE		= 0x14,
-	MAX77686_ALARM2_SEC			= 0x15,
-	MAX77686_ALARM2_MIN			= 0x16,
+	MAX77686_ALARM2_SEC		= 0x15,
+	MAX77686_ALARM2_MIN		= 0x16,
 	MAX77686_ALARM2_HOUR		= 0x17,
 	MAX77686_ALARM2_WEEKDAY		= 0x18,
 	MAX77686_ALARM2_MONTH		= 0x19,
-- 
cgit v1.2.3


From b20cd02f7fef68ae395d9df0a9fb9edcf414b5a2 Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <digetx@gmail.com>
Date: Thu, 21 Oct 2021 22:22:58 +0300
Subject: mfd: tps80031: Remove driver

Driver was upstreamed in 2013 and never got a user, remove it.

Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20211021192258.21968-4-digetx@gmail.com
---
 include/linux/mfd/tps80031.h | 637 -------------------------------------------
 1 file changed, 637 deletions(-)
 delete mode 100644 include/linux/mfd/tps80031.h

(limited to 'include/linux')

diff --git a/include/linux/mfd/tps80031.h b/include/linux/mfd/tps80031.h
deleted file mode 100644
index 2c75c9c9318f..000000000000
--- a/include/linux/mfd/tps80031.h
+++ /dev/null
@@ -1,637 +0,0 @@
-/*
- * tps80031.h -- TI TPS80031 and TI TPS80032 PMIC driver.
- *
- * Copyright (c) 2012, NVIDIA Corporation.
- *
- * Author: Laxman Dewangan <ldewangan@nvidia.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any kind,
- * whether express or implied; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
- * 02111-1307, USA
- */
-
-#ifndef __LINUX_MFD_TPS80031_H
-#define __LINUX_MFD_TPS80031_H
-
-#include <linux/device.h>
-#include <linux/regmap.h>
-
-/* Pull-ups/Pull-downs */
-#define TPS80031_CFG_INPUT_PUPD1			0xF0
-#define TPS80031_CFG_INPUT_PUPD2			0xF1
-#define TPS80031_CFG_INPUT_PUPD3			0xF2
-#define TPS80031_CFG_INPUT_PUPD4			0xF3
-#define TPS80031_CFG_LDO_PD1				0xF4
-#define TPS80031_CFG_LDO_PD2				0xF5
-#define TPS80031_CFG_SMPS_PD				0xF6
-
-/* Real Time Clock */
-#define TPS80031_SECONDS_REG				0x00
-#define TPS80031_MINUTES_REG				0x01
-#define TPS80031_HOURS_REG				0x02
-#define TPS80031_DAYS_REG				0x03
-#define TPS80031_MONTHS_REG				0x04
-#define TPS80031_YEARS_REG				0x05
-#define TPS80031_WEEKS_REG				0x06
-#define TPS80031_ALARM_SECONDS_REG			0x08
-#define TPS80031_ALARM_MINUTES_REG			0x09
-#define TPS80031_ALARM_HOURS_REG			0x0A
-#define TPS80031_ALARM_DAYS_REG				0x0B
-#define TPS80031_ALARM_MONTHS_REG			0x0C
-#define TPS80031_ALARM_YEARS_REG			0x0D
-#define TPS80031_RTC_CTRL_REG				0x10
-#define TPS80031_RTC_STATUS_REG				0x11
-#define TPS80031_RTC_INTERRUPTS_REG			0x12
-#define TPS80031_RTC_COMP_LSB_REG			0x13
-#define TPS80031_RTC_COMP_MSB_REG			0x14
-#define TPS80031_RTC_RESET_STATUS_REG			0x16
-
-/*PMC Master Module */
-#define TPS80031_PHOENIX_START_CONDITION		0x1F
-#define TPS80031_PHOENIX_MSK_TRANSITION			0x20
-#define TPS80031_STS_HW_CONDITIONS			0x21
-#define TPS80031_PHOENIX_LAST_TURNOFF_STS		0x22
-#define TPS80031_VSYSMIN_LO_THRESHOLD			0x23
-#define TPS80031_VSYSMIN_HI_THRESHOLD			0x24
-#define TPS80031_PHOENIX_DEV_ON				0x25
-#define TPS80031_STS_PWR_GRP_STATE			0x27
-#define TPS80031_PH_CFG_VSYSLOW				0x28
-#define TPS80031_PH_STS_BOOT				0x29
-#define TPS80031_PHOENIX_SENS_TRANSITION		0x2A
-#define TPS80031_PHOENIX_SEQ_CFG			0x2B
-#define TPS80031_PRIMARY_WATCHDOG_CFG			0X2C
-#define TPS80031_KEY_PRESS_DUR_CFG			0X2D
-#define TPS80031_SMPS_LDO_SHORT_STS			0x2E
-
-/* PMC Slave Module - Broadcast */
-#define TPS80031_BROADCAST_ADDR_ALL			0x31
-#define TPS80031_BROADCAST_ADDR_REF			0x32
-#define TPS80031_BROADCAST_ADDR_PROV			0x33
-#define TPS80031_BROADCAST_ADDR_CLK_RST			0x34
-
-/* PMC Slave Module  SMPS Regulators */
-#define TPS80031_SMPS4_CFG_TRANS			0x41
-#define TPS80031_SMPS4_CFG_STATE			0x42
-#define TPS80031_SMPS4_CFG_VOLTAGE			0x44
-#define TPS80031_VIO_CFG_TRANS				0x47
-#define TPS80031_VIO_CFG_STATE				0x48
-#define TPS80031_VIO_CFG_FORCE				0x49
-#define TPS80031_VIO_CFG_VOLTAGE			0x4A
-#define TPS80031_VIO_CFG_STEP				0x48
-#define TPS80031_SMPS1_CFG_TRANS			0x53
-#define TPS80031_SMPS1_CFG_STATE			0x54
-#define TPS80031_SMPS1_CFG_FORCE			0x55
-#define TPS80031_SMPS1_CFG_VOLTAGE			0x56
-#define TPS80031_SMPS1_CFG_STEP				0x57
-#define TPS80031_SMPS2_CFG_TRANS			0x59
-#define TPS80031_SMPS2_CFG_STATE			0x5A
-#define TPS80031_SMPS2_CFG_FORCE			0x5B
-#define TPS80031_SMPS2_CFG_VOLTAGE			0x5C
-#define TPS80031_SMPS2_CFG_STEP				0x5D
-#define TPS80031_SMPS3_CFG_TRANS			0x65
-#define TPS80031_SMPS3_CFG_STATE			0x66
-#define TPS80031_SMPS3_CFG_VOLTAGE			0x68
-
-/* PMC Slave Module  LDO Regulators */
-#define TPS80031_VANA_CFG_TRANS				0x81
-#define TPS80031_VANA_CFG_STATE				0x82
-#define TPS80031_VANA_CFG_VOLTAGE			0x83
-#define TPS80031_LDO2_CFG_TRANS				0x85
-#define TPS80031_LDO2_CFG_STATE				0x86
-#define TPS80031_LDO2_CFG_VOLTAGE			0x87
-#define TPS80031_LDO4_CFG_TRANS				0x89
-#define TPS80031_LDO4_CFG_STATE				0x8A
-#define TPS80031_LDO4_CFG_VOLTAGE			0x8B
-#define TPS80031_LDO3_CFG_TRANS				0x8D
-#define TPS80031_LDO3_CFG_STATE				0x8E
-#define TPS80031_LDO3_CFG_VOLTAGE			0x8F
-#define TPS80031_LDO6_CFG_TRANS				0x91
-#define TPS80031_LDO6_CFG_STATE				0x92
-#define TPS80031_LDO6_CFG_VOLTAGE			0x93
-#define TPS80031_LDOLN_CFG_TRANS			0x95
-#define TPS80031_LDOLN_CFG_STATE			0x96
-#define TPS80031_LDOLN_CFG_VOLTAGE			0x97
-#define TPS80031_LDO5_CFG_TRANS				0x99
-#define TPS80031_LDO5_CFG_STATE				0x9A
-#define TPS80031_LDO5_CFG_VOLTAGE			0x9B
-#define TPS80031_LDO1_CFG_TRANS				0x9D
-#define TPS80031_LDO1_CFG_STATE				0x9E
-#define TPS80031_LDO1_CFG_VOLTAGE			0x9F
-#define TPS80031_LDOUSB_CFG_TRANS			0xA1
-#define TPS80031_LDOUSB_CFG_STATE			0xA2
-#define TPS80031_LDOUSB_CFG_VOLTAGE			0xA3
-#define TPS80031_LDO7_CFG_TRANS				0xA5
-#define TPS80031_LDO7_CFG_STATE				0xA6
-#define TPS80031_LDO7_CFG_VOLTAGE			0xA7
-
-/* PMC Slave Module  External Control */
-#define TPS80031_REGEN1_CFG_TRANS			0xAE
-#define TPS80031_REGEN1_CFG_STATE			0xAF
-#define TPS80031_REGEN2_CFG_TRANS			0xB1
-#define TPS80031_REGEN2_CFG_STATE			0xB2
-#define TPS80031_SYSEN_CFG_TRANS			0xB4
-#define TPS80031_SYSEN_CFG_STATE			0xB5
-
-/* PMC Slave Module  Internal Control */
-#define TPS80031_NRESPWRON_CFG_TRANS			0xB7
-#define TPS80031_NRESPWRON_CFG_STATE			0xB8
-#define TPS80031_CLK32KAO_CFG_TRANS			0xBA
-#define TPS80031_CLK32KAO_CFG_STATE			0xBB
-#define TPS80031_CLK32KG_CFG_TRANS			0xBD
-#define TPS80031_CLK32KG_CFG_STATE			0xBE
-#define TPS80031_CLK32KAUDIO_CFG_TRANS			0xC0
-#define TPS80031_CLK32KAUDIO_CFG_STATE			0xC1
-#define TPS80031_VRTC_CFG_TRANS				0xC3
-#define TPS80031_VRTC_CFG_STATE				0xC4
-#define TPS80031_BIAS_CFG_TRANS				0xC6
-#define TPS80031_BIAS_CFG_STATE				0xC7
-#define TPS80031_VSYSMIN_HI_CFG_TRANS			0xC9
-#define TPS80031_VSYSMIN_HI_CFG_STATE			0xCA
-#define TPS80031_RC6MHZ_CFG_TRANS			0xCC
-#define TPS80031_RC6MHZ_CFG_STATE			0xCD
-#define TPS80031_TMP_CFG_TRANS				0xCF
-#define TPS80031_TMP_CFG_STATE				0xD0
-
-/* PMC Slave Module  resources assignment */
-#define TPS80031_PREQ1_RES_ASS_A			0xD7
-#define TPS80031_PREQ1_RES_ASS_B			0xD8
-#define TPS80031_PREQ1_RES_ASS_C			0xD9
-#define TPS80031_PREQ2_RES_ASS_A			0xDA
-#define TPS80031_PREQ2_RES_ASS_B			0xDB
-#define TPS80031_PREQ2_RES_ASS_C			0xDC
-#define TPS80031_PREQ3_RES_ASS_A			0xDD
-#define TPS80031_PREQ3_RES_ASS_B			0xDE
-#define TPS80031_PREQ3_RES_ASS_C			0xDF
-
-/* PMC Slave Module  Miscellaneous */
-#define TPS80031_SMPS_OFFSET				0xE0
-#define TPS80031_SMPS_MULT				0xE3
-#define TPS80031_MISC1					0xE4
-#define TPS80031_MISC2					0xE5
-#define TPS80031_BBSPOR_CFG				0xE6
-#define TPS80031_TMP_CFG				0xE7
-
-/* Battery Charging Controller and Indicator LED */
-#define TPS80031_CONTROLLER_CTRL2			0xDA
-#define TPS80031_CONTROLLER_VSEL_COMP			0xDB
-#define TPS80031_CHARGERUSB_VSYSREG			0xDC
-#define TPS80031_CHARGERUSB_VICHRG_PC			0xDD
-#define TPS80031_LINEAR_CHRG_STS			0xDE
-#define TPS80031_CONTROLLER_INT_MASK			0xE0
-#define TPS80031_CONTROLLER_CTRL1			0xE1
-#define TPS80031_CONTROLLER_WDG				0xE2
-#define TPS80031_CONTROLLER_STAT1			0xE3
-#define TPS80031_CHARGERUSB_INT_STATUS			0xE4
-#define TPS80031_CHARGERUSB_INT_MASK			0xE5
-#define TPS80031_CHARGERUSB_STATUS_INT1			0xE6
-#define TPS80031_CHARGERUSB_STATUS_INT2			0xE7
-#define TPS80031_CHARGERUSB_CTRL1			0xE8
-#define TPS80031_CHARGERUSB_CTRL2			0xE9
-#define TPS80031_CHARGERUSB_CTRL3			0xEA
-#define TPS80031_CHARGERUSB_STAT1			0xEB
-#define TPS80031_CHARGERUSB_VOREG			0xEC
-#define TPS80031_CHARGERUSB_VICHRG			0xED
-#define TPS80031_CHARGERUSB_CINLIMIT			0xEE
-#define TPS80031_CHARGERUSB_CTRLLIMIT1			0xEF
-#define TPS80031_CHARGERUSB_CTRLLIMIT2			0xF0
-#define TPS80031_LED_PWM_CTRL1				0xF4
-#define TPS80031_LED_PWM_CTRL2				0xF5
-
-/* USB On-The-Go  */
-#define TPS80031_BACKUP_REG				0xFA
-#define TPS80031_USB_VENDOR_ID_LSB			0x00
-#define TPS80031_USB_VENDOR_ID_MSB			0x01
-#define TPS80031_USB_PRODUCT_ID_LSB			0x02
-#define TPS80031_USB_PRODUCT_ID_MSB			0x03
-#define TPS80031_USB_VBUS_CTRL_SET			0x04
-#define TPS80031_USB_VBUS_CTRL_CLR			0x05
-#define TPS80031_USB_ID_CTRL_SET			0x06
-#define TPS80031_USB_ID_CTRL_CLR			0x07
-#define TPS80031_USB_VBUS_INT_SRC			0x08
-#define TPS80031_USB_VBUS_INT_LATCH_SET			0x09
-#define TPS80031_USB_VBUS_INT_LATCH_CLR			0x0A
-#define TPS80031_USB_VBUS_INT_EN_LO_SET			0x0B
-#define TPS80031_USB_VBUS_INT_EN_LO_CLR			0x0C
-#define TPS80031_USB_VBUS_INT_EN_HI_SET			0x0D
-#define TPS80031_USB_VBUS_INT_EN_HI_CLR			0x0E
-#define TPS80031_USB_ID_INT_SRC				0x0F
-#define TPS80031_USB_ID_INT_LATCH_SET			0x10
-#define TPS80031_USB_ID_INT_LATCH_CLR			0x11
-#define TPS80031_USB_ID_INT_EN_LO_SET			0x12
-#define TPS80031_USB_ID_INT_EN_LO_CLR			0x13
-#define TPS80031_USB_ID_INT_EN_HI_SET			0x14
-#define TPS80031_USB_ID_INT_EN_HI_CLR			0x15
-#define TPS80031_USB_OTG_ADP_CTRL			0x16
-#define TPS80031_USB_OTG_ADP_HIGH			0x17
-#define TPS80031_USB_OTG_ADP_LOW			0x18
-#define TPS80031_USB_OTG_ADP_RISE			0x19
-#define TPS80031_USB_OTG_REVISION			0x1A
-
-/* Gas Gauge */
-#define TPS80031_FG_REG_00				0xC0
-#define TPS80031_FG_REG_01				0xC1
-#define TPS80031_FG_REG_02				0xC2
-#define TPS80031_FG_REG_03				0xC3
-#define TPS80031_FG_REG_04				0xC4
-#define TPS80031_FG_REG_05				0xC5
-#define TPS80031_FG_REG_06				0xC6
-#define TPS80031_FG_REG_07				0xC7
-#define TPS80031_FG_REG_08				0xC8
-#define TPS80031_FG_REG_09				0xC9
-#define TPS80031_FG_REG_10				0xCA
-#define TPS80031_FG_REG_11				0xCB
-
-/* General Purpose ADC */
-#define TPS80031_GPADC_CTRL				0x2E
-#define TPS80031_GPADC_CTRL2				0x2F
-#define TPS80031_RTSELECT_LSB				0x32
-#define TPS80031_RTSELECT_ISB				0x33
-#define TPS80031_RTSELECT_MSB				0x34
-#define TPS80031_GPSELECT_ISB				0x35
-#define TPS80031_CTRL_P1				0x36
-#define TPS80031_RTCH0_LSB				0x37
-#define TPS80031_RTCH0_MSB				0x38
-#define TPS80031_RTCH1_LSB				0x39
-#define TPS80031_RTCH1_MSB				0x3A
-#define TPS80031_GPCH0_LSB				0x3B
-#define TPS80031_GPCH0_MSB				0x3C
-
-/* SIM, MMC and Battery Detection */
-#define TPS80031_SIMDEBOUNCING				0xEB
-#define TPS80031_SIMCTRL				0xEC
-#define TPS80031_MMCDEBOUNCING				0xED
-#define TPS80031_MMCCTRL				0xEE
-#define TPS80031_BATDEBOUNCING				0xEF
-
-/* Vibrator Driver and PWMs */
-#define TPS80031_VIBCTRL				0x9B
-#define TPS80031_VIBMODE				0x9C
-#define TPS80031_PWM1ON					0xBA
-#define TPS80031_PWM1OFF				0xBB
-#define TPS80031_PWM2ON					0xBD
-#define TPS80031_PWM2OFF				0xBE
-
-/* Control Interface */
-#define TPS80031_INT_STS_A				0xD0
-#define TPS80031_INT_STS_B				0xD1
-#define TPS80031_INT_STS_C				0xD2
-#define TPS80031_INT_MSK_LINE_A				0xD3
-#define TPS80031_INT_MSK_LINE_B				0xD4
-#define TPS80031_INT_MSK_LINE_C				0xD5
-#define TPS80031_INT_MSK_STS_A				0xD6
-#define TPS80031_INT_MSK_STS_B				0xD7
-#define TPS80031_INT_MSK_STS_C				0xD8
-#define TPS80031_TOGGLE1				0x90
-#define TPS80031_TOGGLE2				0x91
-#define TPS80031_TOGGLE3				0x92
-#define TPS80031_PWDNSTATUS1				0x93
-#define TPS80031_PWDNSTATUS2				0x94
-#define TPS80031_VALIDITY0				0x17
-#define TPS80031_VALIDITY1				0x18
-#define TPS80031_VALIDITY2				0x19
-#define TPS80031_VALIDITY3				0x1A
-#define TPS80031_VALIDITY4				0x1B
-#define TPS80031_VALIDITY5				0x1C
-#define TPS80031_VALIDITY6				0x1D
-#define TPS80031_VALIDITY7				0x1E
-
-/* Version number related register */
-#define TPS80031_JTAGVERNUM				0x87
-#define TPS80031_EPROM_REV				0xDF
-
-/* GPADC Trimming Bits. */
-#define TPS80031_GPADC_TRIM0				0xCC
-#define TPS80031_GPADC_TRIM1				0xCD
-#define TPS80031_GPADC_TRIM2				0xCE
-#define TPS80031_GPADC_TRIM3				0xCF
-#define TPS80031_GPADC_TRIM4				0xD0
-#define TPS80031_GPADC_TRIM5				0xD1
-#define TPS80031_GPADC_TRIM6				0xD2
-#define TPS80031_GPADC_TRIM7				0xD3
-#define TPS80031_GPADC_TRIM8				0xD4
-#define TPS80031_GPADC_TRIM9				0xD5
-#define TPS80031_GPADC_TRIM10				0xD6
-#define TPS80031_GPADC_TRIM11				0xD7
-#define TPS80031_GPADC_TRIM12				0xD8
-#define TPS80031_GPADC_TRIM13				0xD9
-#define TPS80031_GPADC_TRIM14				0xDA
-#define TPS80031_GPADC_TRIM15				0xDB
-#define TPS80031_GPADC_TRIM16				0xDC
-#define TPS80031_GPADC_TRIM17				0xDD
-#define TPS80031_GPADC_TRIM18				0xDE
-
-/* TPS80031_CONTROLLER_STAT1 bit fields */
-#define TPS80031_CONTROLLER_STAT1_BAT_TEMP		0
-#define TPS80031_CONTROLLER_STAT1_BAT_REMOVED		1
-#define TPS80031_CONTROLLER_STAT1_VBUS_DET		2
-#define TPS80031_CONTROLLER_STAT1_VAC_DET		3
-#define TPS80031_CONTROLLER_STAT1_FAULT_WDG		4
-#define TPS80031_CONTROLLER_STAT1_LINCH_GATED		6
-/* TPS80031_CONTROLLER_INT_MASK bit filed */
-#define TPS80031_CONTROLLER_INT_MASK_MVAC_DET		0
-#define TPS80031_CONTROLLER_INT_MASK_MVBUS_DET		1
-#define TPS80031_CONTROLLER_INT_MASK_MBAT_TEMP		2
-#define TPS80031_CONTROLLER_INT_MASK_MFAULT_WDG		3
-#define TPS80031_CONTROLLER_INT_MASK_MBAT_REMOVED	4
-#define TPS80031_CONTROLLER_INT_MASK_MLINCH_GATED	5
-
-#define TPS80031_CHARGE_CONTROL_SUB_INT_MASK		0x3F
-
-/* TPS80031_PHOENIX_DEV_ON bit field */
-#define TPS80031_DEVOFF					0x1
-
-#define TPS80031_EXT_CONTROL_CFG_TRANS			0
-#define TPS80031_EXT_CONTROL_CFG_STATE			1
-
-/* State register field */
-#define TPS80031_STATE_OFF				0x00
-#define TPS80031_STATE_ON				0x01
-#define TPS80031_STATE_MASK				0x03
-
-/* Trans register field */
-#define TPS80031_TRANS_ACTIVE_OFF			0x00
-#define TPS80031_TRANS_ACTIVE_ON			0x01
-#define TPS80031_TRANS_ACTIVE_MASK			0x03
-#define TPS80031_TRANS_SLEEP_OFF			0x00
-#define TPS80031_TRANS_SLEEP_ON				0x04
-#define TPS80031_TRANS_SLEEP_MASK			0x0C
-#define TPS80031_TRANS_OFF_OFF				0x00
-#define TPS80031_TRANS_OFF_ACTIVE			0x10
-#define TPS80031_TRANS_OFF_MASK				0x30
-
-#define TPS80031_EXT_PWR_REQ		(TPS80031_PWR_REQ_INPUT_PREQ1 | \
-					TPS80031_PWR_REQ_INPUT_PREQ2 | \
-					TPS80031_PWR_REQ_INPUT_PREQ3)
-
-/* TPS80031_BBSPOR_CFG bit field */
-#define TPS80031_BBSPOR_CHG_EN				0x8
-#define TPS80031_MAX_REGISTER				0xFF
-
-struct i2c_client;
-
-/* Supported chips */
-enum chips {
-	TPS80031 = 0x00000001,
-	TPS80032 = 0x00000002,
-};
-
-enum {
-	TPS80031_INT_PWRON,
-	TPS80031_INT_RPWRON,
-	TPS80031_INT_SYS_VLOW,
-	TPS80031_INT_RTC_ALARM,
-	TPS80031_INT_RTC_PERIOD,
-	TPS80031_INT_HOT_DIE,
-	TPS80031_INT_VXX_SHORT,
-	TPS80031_INT_SPDURATION,
-	TPS80031_INT_WATCHDOG,
-	TPS80031_INT_BAT,
-	TPS80031_INT_SIM,
-	TPS80031_INT_MMC,
-	TPS80031_INT_RES,
-	TPS80031_INT_GPADC_RT,
-	TPS80031_INT_GPADC_SW2_EOC,
-	TPS80031_INT_CC_AUTOCAL,
-	TPS80031_INT_ID_WKUP,
-	TPS80031_INT_VBUSS_WKUP,
-	TPS80031_INT_ID,
-	TPS80031_INT_VBUS,
-	TPS80031_INT_CHRG_CTRL,
-	TPS80031_INT_EXT_CHRG,
-	TPS80031_INT_INT_CHRG,
-	TPS80031_INT_RES2,
-	TPS80031_INT_BAT_TEMP_OVRANGE,
-	TPS80031_INT_BAT_REMOVED,
-	TPS80031_INT_VBUS_DET,
-	TPS80031_INT_VAC_DET,
-	TPS80031_INT_FAULT_WDG,
-	TPS80031_INT_LINCH_GATED,
-
-	/* Last interrupt id to get the end number */
-	TPS80031_INT_NR,
-};
-
-/* TPS80031 Slave IDs */
-#define TPS80031_NUM_SLAVES				4
-#define TPS80031_SLAVE_ID0				0
-#define TPS80031_SLAVE_ID1				1
-#define TPS80031_SLAVE_ID2				2
-#define TPS80031_SLAVE_ID3				3
-
-/* TPS80031 I2C addresses */
-#define TPS80031_I2C_ID0_ADDR				0x12
-#define TPS80031_I2C_ID1_ADDR				0x48
-#define TPS80031_I2C_ID2_ADDR				0x49
-#define TPS80031_I2C_ID3_ADDR				0x4A
-
-enum {
-	TPS80031_REGULATOR_VIO,
-	TPS80031_REGULATOR_SMPS1,
-	TPS80031_REGULATOR_SMPS2,
-	TPS80031_REGULATOR_SMPS3,
-	TPS80031_REGULATOR_SMPS4,
-	TPS80031_REGULATOR_VANA,
-	TPS80031_REGULATOR_LDO1,
-	TPS80031_REGULATOR_LDO2,
-	TPS80031_REGULATOR_LDO3,
-	TPS80031_REGULATOR_LDO4,
-	TPS80031_REGULATOR_LDO5,
-	TPS80031_REGULATOR_LDO6,
-	TPS80031_REGULATOR_LDO7,
-	TPS80031_REGULATOR_LDOLN,
-	TPS80031_REGULATOR_LDOUSB,
-	TPS80031_REGULATOR_VBUS,
-	TPS80031_REGULATOR_REGEN1,
-	TPS80031_REGULATOR_REGEN2,
-	TPS80031_REGULATOR_SYSEN,
-	TPS80031_REGULATOR_MAX,
-};
-
-/* Different configurations for the rails */
-enum {
-	/* USBLDO input selection */
-	TPS80031_USBLDO_INPUT_VSYS		= 0x00000001,
-	TPS80031_USBLDO_INPUT_PMID		= 0x00000002,
-
-	/* LDO3 output mode */
-	TPS80031_LDO3_OUTPUT_VIB		= 0x00000004,
-
-	/* VBUS configuration */
-	TPS80031_VBUS_DISCHRG_EN_PDN		= 0x00000004,
-	TPS80031_VBUS_SW_ONLY			= 0x00000008,
-	TPS80031_VBUS_SW_N_ID			= 0x00000010,
-};
-
-/* External controls requests */
-enum tps80031_ext_control {
-	TPS80031_PWR_REQ_INPUT_NONE		= 0x00000000,
-	TPS80031_PWR_REQ_INPUT_PREQ1		= 0x00000001,
-	TPS80031_PWR_REQ_INPUT_PREQ2		= 0x00000002,
-	TPS80031_PWR_REQ_INPUT_PREQ3		= 0x00000004,
-	TPS80031_PWR_OFF_ON_SLEEP		= 0x00000008,
-	TPS80031_PWR_ON_ON_SLEEP		= 0x00000010,
-};
-
-enum tps80031_pupd_pins {
-	TPS80031_PREQ1 = 0,
-	TPS80031_PREQ2A,
-	TPS80031_PREQ2B,
-	TPS80031_PREQ2C,
-	TPS80031_PREQ3,
-	TPS80031_NRES_WARM,
-	TPS80031_PWM_FORCE,
-	TPS80031_CHRG_EXT_CHRG_STATZ,
-	TPS80031_SIM,
-	TPS80031_MMC,
-	TPS80031_GPADC_START,
-	TPS80031_DVSI2C_SCL,
-	TPS80031_DVSI2C_SDA,
-	TPS80031_CTLI2C_SCL,
-	TPS80031_CTLI2C_SDA,
-};
-
-enum tps80031_pupd_settings {
-	TPS80031_PUPD_NORMAL,
-	TPS80031_PUPD_PULLDOWN,
-	TPS80031_PUPD_PULLUP,
-};
-
-struct tps80031 {
-	struct device		*dev;
-	unsigned long		chip_info;
-	int			es_version;
-	struct i2c_client	*clients[TPS80031_NUM_SLAVES];
-	struct regmap		*regmap[TPS80031_NUM_SLAVES];
-	struct regmap_irq_chip_data *irq_data;
-};
-
-struct tps80031_pupd_init_data {
-	int input_pin;
-	int setting;
-};
-
-/*
- * struct tps80031_regulator_platform_data - tps80031 regulator platform data.
- *
- * @reg_init_data: The regulator init data.
- * @ext_ctrl_flag: External control flag for sleep/power request control.
- * @config_flags: Configuration flag to configure the rails.
- *		  It should be ORed of config enums.
- */
-
-struct tps80031_regulator_platform_data {
-	struct regulator_init_data *reg_init_data;
-	unsigned int ext_ctrl_flag;
-	unsigned int config_flags;
-};
-
-struct tps80031_platform_data {
-	int irq_base;
-	bool use_power_off;
-	struct tps80031_pupd_init_data *pupd_init_data;
-	int pupd_init_data_size;
-	struct tps80031_regulator_platform_data
-			*regulator_pdata[TPS80031_REGULATOR_MAX];
-};
-
-static inline int tps80031_write(struct device *dev, int sid,
-		int reg, uint8_t val)
-{
-	struct tps80031 *tps80031 = dev_get_drvdata(dev);
-
-	return regmap_write(tps80031->regmap[sid], reg, val);
-}
-
-static inline int tps80031_writes(struct device *dev, int sid, int reg,
-		int len, uint8_t *val)
-{
-	struct tps80031 *tps80031 = dev_get_drvdata(dev);
-
-	return regmap_bulk_write(tps80031->regmap[sid], reg, val, len);
-}
-
-static inline int tps80031_read(struct device *dev, int sid,
-		int reg, uint8_t *val)
-{
-	struct tps80031 *tps80031 = dev_get_drvdata(dev);
-	unsigned int ival;
-	int ret;
-
-	ret = regmap_read(tps80031->regmap[sid], reg, &ival);
-	if (ret < 0) {
-		dev_err(dev, "failed reading from reg 0x%02x\n", reg);
-		return ret;
-	}
-
-	*val = ival;
-	return ret;
-}
-
-static inline int tps80031_reads(struct device *dev, int sid,
-		int reg, int len, uint8_t *val)
-{
-	struct tps80031 *tps80031 = dev_get_drvdata(dev);
-
-	return regmap_bulk_read(tps80031->regmap[sid], reg, val, len);
-}
-
-static inline int tps80031_set_bits(struct device *dev, int sid,
-		int reg, uint8_t bit_mask)
-{
-	struct tps80031 *tps80031 = dev_get_drvdata(dev);
-
-	return regmap_update_bits(tps80031->regmap[sid], reg,
-				bit_mask, bit_mask);
-}
-
-static inline int tps80031_clr_bits(struct device *dev, int sid,
-		int reg, uint8_t bit_mask)
-{
-	struct tps80031 *tps80031 = dev_get_drvdata(dev);
-
-	return regmap_update_bits(tps80031->regmap[sid], reg, bit_mask, 0);
-}
-
-static inline int tps80031_update(struct device *dev, int sid,
-		int reg, uint8_t val, uint8_t mask)
-{
-	struct tps80031 *tps80031 = dev_get_drvdata(dev);
-
-	return regmap_update_bits(tps80031->regmap[sid], reg, mask, val);
-}
-
-static inline unsigned long tps80031_get_chip_info(struct device *dev)
-{
-	struct tps80031 *tps80031 = dev_get_drvdata(dev);
-
-	return tps80031->chip_info;
-}
-
-static inline int tps80031_get_pmu_version(struct device *dev)
-{
-	struct tps80031 *tps80031 = dev_get_drvdata(dev);
-
-	return tps80031->es_version;
-}
-
-static inline int tps80031_irq_get_virq(struct device *dev, int irq)
-{
-	struct tps80031 *tps80031 = dev_get_drvdata(dev);
-
-	return regmap_irq_get_virq(tps80031->irq_data, irq);
-}
-
-extern int tps80031_ext_power_req_config(struct device *dev,
-		unsigned long ext_ctrl_flag, int preq_bit,
-		int state_reg_add, int trans_reg_add);
-#endif /*__LINUX_MFD_TPS80031_H */
-- 
cgit v1.2.3


From d755ad8dc752d44545613ea04d660aed674e540d Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@Netapp.com>
Date: Fri, 22 Oct 2021 13:11:00 -0400
Subject: NFS: Create a new nfs_alloc_fattr_with_label() function

For creating fattrs with the label field already allocated for us. I
also update nfs_free_fattr() to free the label in the end.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/nfs_fs.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 739ca1ef934f..88c3aed8ad39 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -426,9 +426,22 @@ extern void nfs_fattr_set_barrier(struct nfs_fattr *fattr);
 extern unsigned long nfs_inc_attr_generation_counter(void);
 
 extern struct nfs_fattr *nfs_alloc_fattr(void);
+extern struct nfs_fattr *nfs_alloc_fattr_with_label(struct nfs_server *server);
+
+static inline void nfs4_label_free(struct nfs4_label *label)
+{
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+	if (label) {
+		kfree(label->label);
+		kfree(label);
+	}
+#endif
+}
 
 static inline void nfs_free_fattr(const struct nfs_fattr *fattr)
 {
+	if (fattr)
+		nfs4_label_free(fattr->label);
 	kfree(fattr);
 }
 
-- 
cgit v1.2.3


From b1db9a401d464d526d5941f0544e7c9ea37fa731 Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@Netapp.com>
Date: Fri, 22 Oct 2021 13:11:01 -0400
Subject: NFS: Remove the nfs4_label from the nfs_entry struct

And instead allocate the fattr using nfs_alloc_fattr_with_label()

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/nfs_xdr.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index e9698b6278a5..9960f6628066 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -753,7 +753,6 @@ struct nfs_entry {
 	int			eof;
 	struct nfs_fh *		fh;
 	struct nfs_fattr *	fattr;
-	struct nfs4_label  *label;
 	unsigned char		d_type;
 	struct nfs_server *	server;
 };
-- 
cgit v1.2.3


From 68be1742c22983558f0f148a4467eb9127d56b86 Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@Netapp.com>
Date: Fri, 22 Oct 2021 13:11:02 -0400
Subject: NFS: Remove the nfs4_label from the nfs4_create_res struct

Instead, use the label embedded in the attached fattr.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/nfs_xdr.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 9960f6628066..5aba81b74c98 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1040,7 +1040,6 @@ struct nfs4_create_res {
 	const struct nfs_server *	server;
 	struct nfs_fh *			fh;
 	struct nfs_fattr *		fattr;
-	struct nfs4_label		*label;
 	struct nfs4_change_info		dir_cinfo;
 };
 
-- 
cgit v1.2.3


From aa7ca3b2de190675543d84adaa1ff74e7867c76f Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@Netapp.com>
Date: Fri, 22 Oct 2021 13:11:03 -0400
Subject: NFS: Remove the nfs4_label from the nfs4_link_res struct

Again, use the fattr's label field instead.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/nfs_xdr.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 5aba81b74c98..d55bf3fd5167 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1079,7 +1079,6 @@ struct nfs4_link_res {
 	struct nfs4_sequence_res	seq_res;
 	const struct nfs_server *	server;
 	struct nfs_fattr *		fattr;
-	struct nfs4_label		*label;
 	struct nfs4_change_info		cinfo;
 	struct nfs_fattr *		dir_attr;
 };
-- 
cgit v1.2.3


From 9558a007dbc383d48e7f5a123d0b5ff656c71068 Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@Netapp.com>
Date: Fri, 22 Oct 2021 13:11:04 -0400
Subject: NFS: Remove the label from the nfs4_lookup_res struct

And usethe fattr's label field instead. I also adjust function calls to
remove labels along the way.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/nfs_xdr.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index d55bf3fd5167..95219d5a8668 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1095,7 +1095,6 @@ struct nfs4_lookup_res {
 	const struct nfs_server *	server;
 	struct nfs_fattr *		fattr;
 	struct nfs_fh *			fh;
-	struct nfs4_label		*label;
 };
 
 struct nfs4_lookupp_arg {
@@ -1740,8 +1739,7 @@ struct nfs_rpc_ops {
 	int	(*setattr) (struct dentry *, struct nfs_fattr *,
 			    struct iattr *);
 	int	(*lookup)  (struct inode *, struct dentry *,
-			    struct nfs_fh *, struct nfs_fattr *,
-			    struct nfs4_label *);
+			    struct nfs_fh *, struct nfs_fattr *);
 	int	(*lookupp) (struct inode *, struct nfs_fh *,
 			    struct nfs_fattr *, struct nfs4_label *);
 	int	(*access)  (struct inode *, struct nfs_access_entry *);
-- 
cgit v1.2.3


From ba4bc8dc4d937df2b407393435a302550be0ad82 Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@Netapp.com>
Date: Fri, 22 Oct 2021 13:11:05 -0400
Subject: NFS: Remove the nfs4_label from the nfs4_lookupp_res struct

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/nfs_xdr.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 95219d5a8668..f0a685d9b8bd 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1108,7 +1108,6 @@ struct nfs4_lookupp_res {
 	const struct nfs_server		*server;
 	struct nfs_fattr		*fattr;
 	struct nfs_fh			*fh;
-	struct nfs4_label		*label;
 };
 
 struct nfs4_lookup_root_arg {
@@ -1741,7 +1740,7 @@ struct nfs_rpc_ops {
 	int	(*lookup)  (struct inode *, struct dentry *,
 			    struct nfs_fh *, struct nfs_fattr *);
 	int	(*lookupp) (struct inode *, struct nfs_fh *,
-			    struct nfs_fattr *, struct nfs4_label *);
+			    struct nfs_fattr *);
 	int	(*access)  (struct inode *, struct nfs_access_entry *);
 	int	(*readlink)(struct inode *, struct page *, unsigned int,
 			    unsigned int);
-- 
cgit v1.2.3


From 76baa2b29c7161bc65a3051d311297b7d7fc827a Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@Netapp.com>
Date: Fri, 22 Oct 2021 13:11:06 -0400
Subject: NFS: Remove the f_label from the nfs4_opendata and nfs_openres

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/nfs_xdr.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index f0a685d9b8bd..cb28e01ea41e 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -488,7 +488,6 @@ struct nfs_openres {
 	struct nfs4_change_info	cinfo;
 	__u32                   rflags;
 	struct nfs_fattr *      f_attr;
-	struct nfs4_label	*f_label;
 	struct nfs_seqid *	seqid;
 	const struct nfs_server *server;
 	fmode_t			delegation_type;
-- 
cgit v1.2.3


From 2ef61e0eaa333e4e9c348c41a4b7abfb34b8736d Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@Netapp.com>
Date: Fri, 22 Oct 2021 13:11:07 -0400
Subject: NFS: Remove the nfs4_label from the nfs4_getattr_res

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/nfs_xdr.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index cb28e01ea41e..817f1bf5f187 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1063,7 +1063,6 @@ struct nfs4_getattr_res {
 	struct nfs4_sequence_res	seq_res;
 	const struct nfs_server *	server;
 	struct nfs_fattr *		fattr;
-	struct nfs4_label		*label;
 };
 
 struct nfs4_link_arg {
@@ -1732,8 +1731,7 @@ struct nfs_rpc_ops {
 	int	(*submount) (struct fs_context *, struct nfs_server *);
 	int	(*try_get_tree) (struct fs_context *);
 	int	(*getattr) (struct nfs_server *, struct nfs_fh *,
-			    struct nfs_fattr *, struct nfs4_label *,
-			    struct inode *);
+			    struct nfs_fattr *, struct inode *);
 	int	(*setattr) (struct dentry *, struct nfs_fattr *,
 			    struct iattr *);
 	int	(*lookup)  (struct inode *, struct dentry *,
-- 
cgit v1.2.3


From 1b00ad657997c8984a9e627a3bd37ea14f20beb2 Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@Netapp.com>
Date: Fri, 22 Oct 2021 13:11:08 -0400
Subject: NFS: Remove the nfs4_label from the nfs_setattrres

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/nfs_xdr.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 817f1bf5f187..967a0098f0a9 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -832,7 +832,6 @@ struct nfs_getaclres {
 struct nfs_setattrres {
 	struct nfs4_sequence_res	seq_res;
 	struct nfs_fattr *              fattr;
-	struct nfs4_label		*label;
 	const struct nfs_server *	server;
 };
 
-- 
cgit v1.2.3


From d91bfc46426d3d772fc0d9d165e3435fd0f0a79e Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@Netapp.com>
Date: Fri, 22 Oct 2021 13:11:09 -0400
Subject: NFS: Remove the nfs4_label argument from nfs_instantiate()

Pull the label from the fattr instead.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/nfs_fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 88c3aed8ad39..a8a9b71aeea6 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -532,7 +532,7 @@ extern struct dentry *nfs_add_or_obtain(struct dentry *dentry,
 			struct nfs_fh *fh, struct nfs_fattr *fattr,
 			struct nfs4_label *label);
 extern int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fh,
-			struct nfs_fattr *fattr, struct nfs4_label *label);
+			struct nfs_fattr *fattr);
 extern int nfs_may_open(struct inode *inode, const struct cred *cred, int openflags);
 extern void nfs_access_zap_cache(struct inode *inode);
 extern int nfs_access_get_cached(struct inode *inode, const struct cred *cred, struct nfs_access_entry *res,
-- 
cgit v1.2.3


From cc6f32989c3202349b90edde0c4702b098410fe8 Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@Netapp.com>
Date: Fri, 22 Oct 2021 13:11:10 -0400
Subject: NFS: Remove the nfs4_label argument from nfs_add_or_obtain()

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/nfs_fs.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index a8a9b71aeea6..6eda001b306b 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -529,8 +529,7 @@ extern void nfs_set_verifier(struct dentry * dentry, unsigned long verf);
 extern void nfs_clear_verifier_delegated(struct inode *inode);
 #endif /* IS_ENABLED(CONFIG_NFS_V4) */
 extern struct dentry *nfs_add_or_obtain(struct dentry *dentry,
-			struct nfs_fh *fh, struct nfs_fattr *fattr,
-			struct nfs4_label *label);
+			struct nfs_fh *fh, struct nfs_fattr *fattr);
 extern int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fh,
 			struct nfs_fattr *fattr);
 extern int nfs_may_open(struct inode *inode, const struct cred *cred, int openflags);
-- 
cgit v1.2.3


From cf7ab00aabbf9c8f1ec72edff15849ddc23aa6a7 Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@Netapp.com>
Date: Fri, 22 Oct 2021 13:11:11 -0400
Subject: NFS: Remove the nfs4_label argument from nfs_fhget()

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/nfs_fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 6eda001b306b..c36c6a559fc9 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -388,7 +388,7 @@ extern void nfs_zap_caches(struct inode *);
 extern void nfs_set_inode_stale(struct inode *inode);
 extern void nfs_invalidate_atime(struct inode *);
 extern struct inode *nfs_fhget(struct super_block *, struct nfs_fh *,
-				struct nfs_fattr *, struct nfs4_label *);
+				struct nfs_fattr *);
 struct inode *nfs_ilookup(struct super_block *sb, struct nfs_fattr *, struct nfs_fh *);
 extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *);
 extern int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr);
-- 
cgit v1.2.3


From dd225cb3b02b827271a2284f89102fc81efcbf6f Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@Netapp.com>
Date: Fri, 22 Oct 2021 13:11:12 -0400
Subject: NFS: Remove the nfs4_label argument from nfs_setsecurity

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/nfs_fs.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index c36c6a559fc9..05f249f20f55 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -409,8 +409,7 @@ extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *map
 extern int nfs_revalidate_mapping_rcu(struct inode *inode);
 extern int nfs_setattr(struct user_namespace *, struct dentry *, struct iattr *);
 extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, struct nfs_fattr *);
-extern void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
-				struct nfs4_label *label);
+extern void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr);
 extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx);
 extern void put_nfs_open_context(struct nfs_open_context *ctx);
 extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, const struct cred *cred, fmode_t mode);
-- 
cgit v1.2.3


From 3990ed4c426652fcd469f8c9dc08156294b36c28 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Fri, 5 Nov 2021 18:40:14 -0700
Subject: bpf: Stop caching subprog index in the bpf_pseudo_func insn

This patch is to fix an out-of-bound access issue when jit-ing the
bpf_pseudo_func insn (i.e. ld_imm64 with src_reg == BPF_PSEUDO_FUNC)

In jit_subprog(), it currently reuses the subprog index cached in
insn[1].imm.  This subprog index is an index into a few array related
to subprogs.  For example, in jit_subprog(), it is an index to the newly
allocated 'struct bpf_prog **func' array.

The subprog index was cached in insn[1].imm after add_subprog().  However,
this could become outdated (and too big in this case) if some subprogs
are completely removed during dead code elimination (in
adjust_subprog_starts_after_remove).  The cached index in insn[1].imm
is not updated accordingly and causing out-of-bound issue in the later
jit_subprog().

Unlike bpf_pseudo_'func' insn, the current bpf_pseudo_'call' insn
is handling the DCE properly by calling find_subprog(insn->imm) to
figure out the index instead of caching the subprog index.
The existing bpf_adj_branches() will adjust the insn->imm
whenever insn is added or removed.

Instead of having two ways handling subprog index,
this patch is to make bpf_pseudo_func works more like
bpf_pseudo_call.

First change is to stop caching the subprog index result
in insn[1].imm after add_subprog().  The verification
process will use find_subprog(insn->imm) to figure
out the subprog index.

Second change is in bpf_adj_branches() and have it to
adjust the insn->imm for the bpf_pseudo_func insn also
whenever insn is added or removed.

Third change is in jit_subprog().  Like the bpf_pseudo_call handling,
bpf_pseudo_func temporarily stores the find_subprog() result
in insn->off.  It is fine because the prog's insn has been finalized
at this point.  insn->off will be reset back to 0 later to avoid
confusing the userspace prog dump tool.

Fixes: 69c087ba6225 ("bpf: Add bpf_for_each_map_elem() helper")
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211106014014.651018-1-kafai@fb.com
---
 include/linux/bpf.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2be6dfd68df9..f715e8863f4d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -484,6 +484,12 @@ bpf_ctx_record_field_size(struct bpf_insn_access_aux *aux, u32 size)
 	aux->ctx_field_size = size;
 }
 
+static inline bool bpf_pseudo_func(const struct bpf_insn *insn)
+{
+	return insn->code == (BPF_LD | BPF_IMM | BPF_DW) &&
+	       insn->src_reg == BPF_PSEUDO_FUNC;
+}
+
 struct bpf_prog_ops {
 	int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr,
 			union bpf_attr __user *uattr);
-- 
cgit v1.2.3


From 8587ca6f34152ea650bad4b2db68456601159024 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 5 Nov 2021 13:35:07 -0700
Subject: mm: move kvmalloc-related functions to slab.h

Not all files in the kernel should include mm.h.  Migrating callers from
kmalloc to kvmalloc is easier if the kvmalloc functions are in slab.h.

[akpm@linux-foundation.org: move the new kvrealloc() also]
[akpm@linux-foundation.org: drivers/hwmon/occ/p9_sbe.c needs slab.h]

Link: https://lkml.kernel.org/r/20210622215757.3525604-1-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Pekka Enberg <penberg@kernel.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h   | 34 ----------------------------------
 include/linux/slab.h | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 73a52aba448f..32b2ecc47408 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -799,40 +799,6 @@ static inline int is_vmalloc_or_module_addr(const void *x)
 }
 #endif
 
-extern void *kvmalloc_node(size_t size, gfp_t flags, int node);
-static inline void *kvmalloc(size_t size, gfp_t flags)
-{
-	return kvmalloc_node(size, flags, NUMA_NO_NODE);
-}
-static inline void *kvzalloc_node(size_t size, gfp_t flags, int node)
-{
-	return kvmalloc_node(size, flags | __GFP_ZERO, node);
-}
-static inline void *kvzalloc(size_t size, gfp_t flags)
-{
-	return kvmalloc(size, flags | __GFP_ZERO);
-}
-
-static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags)
-{
-	size_t bytes;
-
-	if (unlikely(check_mul_overflow(n, size, &bytes)))
-		return NULL;
-
-	return kvmalloc(bytes, flags);
-}
-
-static inline void *kvcalloc(size_t n, size_t size, gfp_t flags)
-{
-	return kvmalloc_array(n, size, flags | __GFP_ZERO);
-}
-
-extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize,
-		gfp_t flags);
-extern void kvfree(const void *addr);
-extern void kvfree_sensitive(const void *addr, size_t len);
-
 static inline int head_compound_mapcount(struct page *head)
 {
 	return atomic_read(compound_mapcount_ptr(head)) + 1;
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 083f3ce550bc..c0d46b6fa12a 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -732,6 +732,40 @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
 	return kmalloc_node(size, flags | __GFP_ZERO, node);
 }
 
+extern void *kvmalloc_node(size_t size, gfp_t flags, int node);
+static inline void *kvmalloc(size_t size, gfp_t flags)
+{
+	return kvmalloc_node(size, flags, NUMA_NO_NODE);
+}
+static inline void *kvzalloc_node(size_t size, gfp_t flags, int node)
+{
+	return kvmalloc_node(size, flags | __GFP_ZERO, node);
+}
+static inline void *kvzalloc(size_t size, gfp_t flags)
+{
+	return kvmalloc(size, flags | __GFP_ZERO);
+}
+
+static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags)
+{
+	size_t bytes;
+
+	if (unlikely(check_mul_overflow(n, size, &bytes)))
+		return NULL;
+
+	return kvmalloc(bytes, flags);
+}
+
+static inline void *kvcalloc(size_t n, size_t size, gfp_t flags)
+{
+	return kvmalloc_array(n, size, flags | __GFP_ZERO);
+}
+
+extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize,
+		gfp_t flags);
+extern void kvfree(const void *addr);
+extern void kvfree_sensitive(const void *addr, size_t len);
+
 unsigned int kmem_cache_size(struct kmem_cache *s);
 void __init kmem_cache_init_late(void);
 
-- 
cgit v1.2.3


From b47291ef02b0bee85ffb7efd6c336060ad1fe1a4 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Fri, 5 Nov 2021 13:35:17 -0700
Subject: mm, slub: change percpu partial accounting from objects to pages

With CONFIG_SLUB_CPU_PARTIAL enabled, SLUB keeps a percpu list of
partial slabs that can be promoted to cpu slab when the previous one is
depleted, without accessing the shared partial list.  A slab can be
added to this list by 1) refill of an empty list from get_partial_node()
- once we really have to access the shared partial list, we acquire
multiple slabs to amortize the cost of locking, and 2) first free to a
previously full slab - instead of putting the slab on a shared partial
list, we can more cheaply freeze it and put it on the per-cpu list.

To control how large a percpu partial list can grow for a kmem cache,
set_cpu_partial() calculates a target number of free objects on each
cpu's percpu partial list, and this can be also set by the sysfs file
cpu_partial.

However, the tracking of actual number of objects is imprecise, in order
to limit overhead from cpu X freeing an objects to a slab on percpu
partial list of cpu Y.  Basically, the percpu partial slabs form a
single linked list, and when we add a new slab to the list with current
head "oldpage", we set in the struct page of the slab we're adding:

    page->pages = oldpage->pages + 1; // this is precise
    page->pobjects = oldpage->pobjects + (page->objects - page->inuse);
    page->next = oldpage;

Thus the real number of free objects in the slab (objects - inuse) is
only determined at the moment of adding the slab to the percpu partial
list, and further freeing doesn't update the pobjects counter nor
propagate it to the current list head.  As Jann reports [1], this can
easily lead to large inaccuracies, where the target number of objects
(up to 30 by default) can translate to the same number of (empty) slab
pages on the list.  In case 2) above, we put a slab with 1 free object
on the list, thus only increase page->pobjects by 1, even if there are
subsequent frees on the same slab.  Jann has noticed this in practice
and so did we [2] when investigating significant increase of kmemcg
usage after switching from SLAB to SLUB.

While this is no longer a problem in kmemcg context thanks to the
accounting rewrite in 5.9, the memory waste is still not ideal and it's
questionable whether it makes sense to perform free object count based
control when object counts can easily become so much inaccurate.  So
this patch converts the accounting to be based on number of pages only
(which is precise) and removes the page->pobjects field completely.
This is also ultimately simpler.

To retain the existing set_cpu_partial() heuristic, first calculate the
target number of objects as previously, but then convert it to target
number of pages by assuming the pages will be half-filled on average.
This assumption might obviously also be inaccurate in practice, but
cannot degrade to actual number of pages being equal to the target
number of objects.

We could also skip the intermediate step with target number of objects
and rewrite the heuristic in terms of pages.  However we still have the
sysfs file cpu_partial which uses number of objects and could break
existing users if it suddenly becomes number of pages, so this patch
doesn't do that.

In practice, after this patch the heuristics limit the size of percpu
partial list up to 2 pages.  In case of a reported regression (which
would mean some workload has benefited from the previous imprecise
object based counting), we can tune the heuristics to get a better
compromise within the new scheme, while still avoid the unexpectedly
long percpu partial lists.

[1] https://lore.kernel.org/linux-mm/CAG48ez2Qx5K1Cab-m8BdSibp6wLTip6ro4=-umR7BLsEgjEYzA@mail.gmail.com/
[2] https://lore.kernel.org/all/2f0f46e8-2535-410a-1859-e9cfa4e57c18@suse.cz/

==========
Evaluation
==========

Mel was kind enough to run v1 through mmtests machinery for netperf
(localhost) and hackbench and, for most significant results see below.
So there are some apparent regressions, especially with hackbench, which
I think ultimately boils down to having shorter percpu partial lists on
average and some benchmarks benefiting from longer ones.  Monitoring
slab usage also indicated less memory usage by slab.  Based on that, the
following patch will bump the defaults to allow longer percpu partial
lists than after this patch.

However the goal is certainly not such that we would limit the percpu
partial lists to 30 pages just because previously a specific alloc/free
pattern could lead to the limit of 30 objects translate to a limit to 30
pages - that would make little sense.  This is a correctness patch, and
if a workload benefits from larger lists, the sysfs tuning knobs are
still there to allow that.

Netperf

  2-socket Intel(R) Xeon(R) Gold 5218R CPU @ 2.10GHz (20 cores, 40 threads per socket), 384GB RAM
  TCP-RR:
    hmean before 127045.79 after 121092.94 (-4.69%, worse)
    stddev before  2634.37 after   1254.08
  UDP-RR:
    hmean before 166985.45 after 160668.94 ( -3.78%, worse)
    stddev before 4059.69 after 1943.63

  2-socket Intel(R) Xeon(R) CPU E5-2698 v4 @ 2.20GHz (20 cores, 40 threads per socket), 512GB RAM
  TCP-RR:
    hmean before 84173.25 after 76914.72 ( -8.62%, worse)
  UDP-RR:
    hmean before 93571.12 after 96428.69 ( 3.05%, better)
    stddev before 23118.54 after 16828.14

  2-socket Intel(R) Xeon(R) CPU E5-2670 v3 @ 2.30GHz (12 cores, 24 threads per socket), 64GB RAM
  TCP-RR:
    hmean before 49984.92 after 48922.27 ( -2.13%, worse)
    stddev before 6248.15 after 4740.51
  UDP-RR:
    hmean before 61854.31 after 68761.81 ( 11.17%, better)
    stddev before 4093.54 after 5898.91

  other machines - within 2%

Hackbench

  (results before and after the patch, negative % means worse)

  2-socket AMD EPYC 7713 (64 cores, 128 threads per core), 256GB RAM
  hackbench-process-sockets
  Amean 	1 	0.5380	0.5583	( -3.78%)
  Amean 	4 	0.7510	0.8150	( -8.52%)
  Amean 	7 	0.7930	0.9533	( -20.22%)
  Amean 	12 	0.7853	1.1313	( -44.06%)
  Amean 	21 	1.1520	1.4993	( -30.15%)
  Amean 	30 	1.6223	1.9237	( -18.57%)
  Amean 	48 	2.6767	2.9903	( -11.72%)
  Amean 	79 	4.0257	5.1150	( -27.06%)
  Amean 	110	5.5193	7.4720	( -35.38%)
  Amean 	141	7.2207	9.9840	( -38.27%)
  Amean 	172	8.4770	12.1963	( -43.88%)
  Amean 	203	9.6473	14.3137	( -48.37%)
  Amean 	234	11.3960	18.7917	( -64.90%)
  Amean 	265	13.9627	22.4607	( -60.86%)
  Amean 	296	14.9163	26.0483	( -74.63%)

  hackbench-thread-sockets
  Amean 	1 	0.5597	0.5877	( -5.00%)
  Amean 	4 	0.7913	0.8960	( -13.23%)
  Amean 	7 	0.8190	1.0017	( -22.30%)
  Amean 	12 	0.9560	1.1727	( -22.66%)
  Amean 	21 	1.7587	1.5660	( 10.96%)
  Amean 	30 	2.4477	1.9807	( 19.08%)
  Amean 	48 	3.4573	3.0630	( 11.41%)
  Amean 	79 	4.7903	5.1733	( -8.00%)
  Amean 	110	6.1370	7.4220	( -20.94%)
  Amean 	141	7.5777	9.2617	( -22.22%)
  Amean 	172	9.2280	11.0907	( -20.18%)
  Amean 	203	10.2793	13.3470	( -29.84%)
  Amean 	234	11.2410	17.1070	( -52.18%)
  Amean 	265	12.5970	23.3323	( -85.22%)
  Amean 	296	17.1540	24.2857	( -41.57%)

  2-socket Intel(R) Xeon(R) Gold 5218R CPU @ 2.10GHz (20 cores, 40 threads
  per socket), 384GB RAM
  hackbench-process-sockets
  Amean 	1 	0.5760	0.4793	( 16.78%)
  Amean 	4 	0.9430	0.9707	( -2.93%)
  Amean 	7 	1.5517	1.8843	( -21.44%)
  Amean 	12 	2.4903	2.7267	( -9.49%)
  Amean 	21 	3.9560	4.2877	( -8.38%)
  Amean 	30 	5.4613	5.8343	( -6.83%)
  Amean 	48 	8.5337	9.2937	( -8.91%)
  Amean 	79 	14.0670	15.2630	( -8.50%)
  Amean 	110	19.2253	21.2467	( -10.51%)
  Amean 	141	23.7557	25.8550	( -8.84%)
  Amean 	172	28.4407	29.7603	( -4.64%)
  Amean 	203	33.3407	33.9927	( -1.96%)
  Amean 	234	38.3633	39.1150	( -1.96%)
  Amean 	265	43.4420	43.8470	( -0.93%)
  Amean 	296	48.3680	48.9300	( -1.16%)

  hackbench-thread-sockets
  Amean 	1 	0.6080	0.6493	( -6.80%)
  Amean 	4 	1.0000	1.0513	( -5.13%)
  Amean 	7 	1.6607	2.0260	( -22.00%)
  Amean 	12 	2.7637	2.9273	( -5.92%)
  Amean 	21 	5.0613	4.5153	( 10.79%)
  Amean 	30 	6.3340	6.1140	( 3.47%)
  Amean 	48 	9.0567	9.5577	( -5.53%)
  Amean 	79 	14.5657	15.7983	( -8.46%)
  Amean 	110	19.6213	21.6333	( -10.25%)
  Amean 	141	24.1563	26.2697	( -8.75%)
  Amean 	172	28.9687	30.2187	( -4.32%)
  Amean 	203	33.9763	34.6970	( -2.12%)
  Amean 	234	38.8647	39.3207	( -1.17%)
  Amean 	265	44.0813	44.1507	( -0.16%)
  Amean 	296	49.2040	49.4330	( -0.47%)

  2-socket Intel(R) Xeon(R) CPU E5-2698 v4 @ 2.20GHz (20 cores, 40 threads
  per socket), 512GB RAM
  hackbench-process-sockets
  Amean 	1 	0.5027	0.5017	( 0.20%)
  Amean 	4 	1.1053	1.2033	( -8.87%)
  Amean 	7 	1.8760	2.1820	( -16.31%)
  Amean 	12 	2.9053	3.1810	( -9.49%)
  Amean 	21 	4.6777	4.9920	( -6.72%)
  Amean 	30 	6.5180	6.7827	( -4.06%)
  Amean 	48 	10.0710	10.5227	( -4.48%)
  Amean 	79 	16.4250	17.5053	( -6.58%)
  Amean 	110	22.6203	24.4617	( -8.14%)
  Amean 	141	28.0967	31.0363	( -10.46%)
  Amean 	172	34.4030	36.9233	( -7.33%)
  Amean 	203	40.5933	43.0850	( -6.14%)
  Amean 	234	46.6477	48.7220	( -4.45%)
  Amean 	265	53.0530	53.9597	( -1.71%)
  Amean 	296	59.2760	59.9213	( -1.09%)

  hackbench-thread-sockets
  Amean 	1 	0.5363	0.5330	( 0.62%)
  Amean 	4 	1.1647	1.2157	( -4.38%)
  Amean 	7 	1.9237	2.2833	( -18.70%)
  Amean 	12 	2.9943	3.3110	( -10.58%)
  Amean 	21 	4.9987	5.1880	( -3.79%)
  Amean 	30 	6.7583	7.0043	( -3.64%)
  Amean 	48 	10.4547	10.8353	( -3.64%)
  Amean 	79 	16.6707	17.6790	( -6.05%)
  Amean 	110	22.8207	24.4403	( -7.10%)
  Amean 	141	28.7090	31.0533	( -8.17%)
  Amean 	172	34.9387	36.8260	( -5.40%)
  Amean 	203	41.1567	43.0450	( -4.59%)
  Amean 	234	47.3790	48.5307	( -2.43%)
  Amean 	265	53.9543	54.6987	( -1.38%)
  Amean 	296	60.0820	60.2163	( -0.22%)

  1-socket Intel(R) Xeon(R) CPU E3-1240 v5 @ 3.50GHz (4 cores, 8 threads),
  32 GB RAM
  hackbench-process-sockets
  Amean 	1 	1.4760	1.5773	( -6.87%)
  Amean 	3 	3.9370	4.0910	( -3.91%)
  Amean 	5 	6.6797	6.9357	( -3.83%)
  Amean 	7 	9.3367	9.7150	( -4.05%)
  Amean 	12	15.7627	16.1400	( -2.39%)
  Amean 	18	23.5360	23.6890	( -0.65%)
  Amean 	24	31.0663	31.3137	( -0.80%)
  Amean 	30	38.7283	39.0037	( -0.71%)
  Amean 	32	41.3417	41.6097	( -0.65%)

  hackbench-thread-sockets
  Amean 	1 	1.5250	1.6043	( -5.20%)
  Amean 	3 	4.0897	4.2603	( -4.17%)
  Amean 	5 	6.7760	7.0933	( -4.68%)
  Amean 	7 	9.4817	9.9157	( -4.58%)
  Amean 	12	15.9610	16.3937	( -2.71%)
  Amean 	18	23.9543	24.3417	( -1.62%)
  Amean 	24	31.4400	31.7217	( -0.90%)
  Amean 	30	39.2457	39.5467	( -0.77%)
  Amean 	32	41.8267	42.1230	( -0.71%)

  2-socket Intel(R) Xeon(R) CPU E5-2670 v3 @ 2.30GHz (12 cores, 24 threads
  per socket), 64GB RAM
  hackbench-process-sockets
  Amean 	1 	1.0347	1.0880	( -5.15%)
  Amean 	4 	1.7267	1.8527	( -7.30%)
  Amean 	7 	2.6707	2.8110	( -5.25%)
  Amean 	12 	4.1617	4.3383	( -4.25%)
  Amean 	21 	7.0070	7.2600	( -3.61%)
  Amean 	30 	9.9187	10.2397	( -3.24%)
  Amean 	48 	15.6710	16.3923	( -4.60%)
  Amean 	79 	24.7743	26.1247	( -5.45%)
  Amean 	110	34.3000	35.9307	( -4.75%)
  Amean 	141	44.2043	44.8010	( -1.35%)
  Amean 	172	54.2430	54.7260	( -0.89%)
  Amean 	192	60.6557	60.9777	( -0.53%)

  hackbench-thread-sockets
  Amean 	1 	1.0610	1.1353	( -7.01%)
  Amean 	4 	1.7543	1.9140	( -9.10%)
  Amean 	7 	2.7840	2.9573	( -6.23%)
  Amean 	12 	4.3813	4.4937	( -2.56%)
  Amean 	21 	7.3460	7.5350	( -2.57%)
  Amean 	30 	10.2313	10.5190	( -2.81%)
  Amean 	48 	15.9700	16.5940	( -3.91%)
  Amean 	79 	25.3973	26.6637	( -4.99%)
  Amean 	110	35.1087	36.4797	( -3.91%)
  Amean 	141	45.8220	46.3053	( -1.05%)
  Amean 	172	55.4917	55.7320	( -0.43%)
  Amean 	192	62.7490	62.5410	( 0.33%)

Link: https://lkml.kernel.org/r/20211012134651.11258-1-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reported-by: Jann Horn <jannh@google.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_types.h |  2 --
 include/linux/slub_def.h | 13 ++-----------
 2 files changed, 2 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7f8ee09c711f..68ffa064b7a8 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -124,10 +124,8 @@ struct page {
 					struct page *next;
 #ifdef CONFIG_64BIT
 					int pages;	/* Nr of pages left */
-					int pobjects;	/* Approximate count */
 #else
 					short int pages;
-					short int pobjects;
 #endif
 				};
 			};
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 85499f0586b0..0fa751b946fa 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -99,6 +99,8 @@ struct kmem_cache {
 #ifdef CONFIG_SLUB_CPU_PARTIAL
 	/* Number of per cpu partial objects to keep around */
 	unsigned int cpu_partial;
+	/* Number of per cpu partial pages to keep around */
+	unsigned int cpu_partial_pages;
 #endif
 	struct kmem_cache_order_objects oo;
 
@@ -141,17 +143,6 @@ struct kmem_cache {
 	struct kmem_cache_node *node[MAX_NUMNODES];
 };
 
-#ifdef CONFIG_SLUB_CPU_PARTIAL
-#define slub_cpu_partial(s)		((s)->cpu_partial)
-#define slub_set_cpu_partial(s, n)		\
-({						\
-	slub_cpu_partial(s) = (n);		\
-})
-#else
-#define slub_cpu_partial(s)		(0)
-#define slub_set_cpu_partial(s, n)
-#endif /* CONFIG_SLUB_CPU_PARTIAL */
-
 #ifdef CONFIG_SYSFS
 #define SLAB_SUPPORTS_SYSFS
 void sysfs_slab_unlink(struct kmem_cache *);
-- 
cgit v1.2.3


From 96c84dde362a3e4ff67a12eaac2ea6e88e963c07 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 5 Nov 2021 13:35:30 -0700
Subject: mm: don't include <linux/dax.h> in <linux/mempolicy.h>

Not required at all, and having this causes a huge kernel rebuild as
soon as something in dax.h changes.

Link: https://lkml.kernel.org/r/20210921082253.1859794-1-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mempolicy.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 4091692bed8c..097bbbb37493 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -8,7 +8,6 @@
 
 #include <linux/sched.h>
 #include <linux/mmzone.h>
-#include <linux/dax.h>
 #include <linux/slab.h>
 #include <linux/rbtree.h>
 #include <linux/spinlock.h>
-- 
cgit v1.2.3


From 7857ccdf94e94f1dd56496b90084fdcaa5e655a4 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Fri, 5 Nov 2021 13:35:33 -0700
Subject: lib/stackdepot: include gfp.h

Patch series "stackdepot, kasan, workqueue: Avoid expanding stackdepot
slabs when holding raw_spin_lock", v2.

Shuah Khan reported [1]:

 | When CONFIG_PROVE_RAW_LOCK_NESTING=y and CONFIG_KASAN are enabled,
 | kasan_record_aux_stack() runs into "BUG: Invalid wait context" when
 | it tries to allocate memory attempting to acquire spinlock in page
 | allocation code while holding workqueue pool raw_spinlock.
 |
 | There are several instances of this problem when block layer tries
 | to __queue_work(). Call trace from one of these instances is below:
 |
 |     kblockd_mod_delayed_work_on()
 |       mod_delayed_work_on()
 |         __queue_delayed_work()
 |           __queue_work() (rcu_read_lock, raw_spin_lock pool->lock held)
 |             insert_work()
 |               kasan_record_aux_stack()
 |                 kasan_save_stack()
 |                   stack_depot_save()
 |                     alloc_pages()
 |                       __alloc_pages()
 |                         get_page_from_freelist()
 |                           rm_queue()
 |                             rm_queue_pcplist()
 |                               local_lock_irqsave(&pagesets.lock, flags);
 |                               [ BUG: Invalid wait context triggered ]

PROVE_RAW_LOCK_NESTING is pointing out that (on RT kernels) the locking
rules are being violated.  More generally, memory is being allocated
from a non-preemptive context (raw_spin_lock'd c-s) where it is not
allowed.

To properly fix this, we must prevent stackdepot from replenishing its
"stack slab" pool if memory allocations cannot be done in the current
context: it's a bug to use either GFP_ATOMIC nor GFP_NOWAIT in certain
non-preemptive contexts, including raw_spin_locks (see gfp.h and commit
ab00db216c9c7).

The only downside is that saving a stack trace may fail if: stackdepot
runs out of space AND the same stack trace has not been recorded before.
I expect this to be unlikely, and a simple experiment (boot the kernel)
didn't result in any failure to record stack trace from insert_work().

The series includes a few minor fixes to stackdepot that I noticed in
preparing the series.  It then introduces __stack_depot_save(), which
exposes the option to force stackdepot to not allocate any memory.
Finally, KASAN is changed to use the new stackdepot interface and
provide kasan_record_aux_stack_noalloc(), which is then used by
workqueue code.

[1] https://lkml.kernel.org/r/20210902200134.25603-1-skhan@linuxfoundation.org

This patch (of 6):

<linux/stackdepot.h> refers to gfp_t, but doesn't include gfp.h.

Fix it by including <linux/gfp.h>.

Link: https://lkml.kernel.org/r/20210913112609.2651084-1-elver@google.com
Link: https://lkml.kernel.org/r/20210913112609.2651084-2-elver@google.com
Signed-off-by: Marco Elver <elver@google.com>
Tested-by: Shuah Khan <skhan@linuxfoundation.org>
Acked-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Walter Wu <walter-zh.wu@mediatek.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Vijayanand Jitta <vjitta@codeaurora.org>
Cc: Vinayak Menon <vinmenon@codeaurora.org>
Cc: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Cc: Taras Madan <tarasmadan@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/stackdepot.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h
index 6bb4bc1a5f54..97b36dc53301 100644
--- a/include/linux/stackdepot.h
+++ b/include/linux/stackdepot.h
@@ -11,6 +11,8 @@
 #ifndef _LINUX_STACKDEPOT_H
 #define _LINUX_STACKDEPOT_H
 
+#include <linux/gfp.h>
+
 typedef u32 depot_stack_handle_t;
 
 depot_stack_handle_t stack_depot_save(unsigned long *entries,
-- 
cgit v1.2.3


From 11ac25c62cd2f3bb8da9e1df2e71afdebe76f093 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Fri, 5 Nov 2021 13:35:39 -0700
Subject: lib/stackdepot: introduce __stack_depot_save()

Add __stack_depot_save(), which provides more fine-grained control over
stackdepot's memory allocation behaviour, in case stackdepot runs out of
"stack slabs".

Normally stackdepot uses alloc_pages() in case it runs out of space;
passing can_alloc==false to __stack_depot_save() prohibits this, at the
cost of more likely failure to record a stack trace.

Link: https://lkml.kernel.org/r/20210913112609.2651084-4-elver@google.com
Signed-off-by: Marco Elver <elver@google.com>
Tested-by: Shuah Khan <skhan@linuxfoundation.org>
Acked-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Taras Madan <tarasmadan@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vijayanand Jitta <vjitta@codeaurora.org>
Cc: Vinayak Menon <vinmenon@codeaurora.org>
Cc: Walter Wu <walter-zh.wu@mediatek.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/stackdepot.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h
index 97b36dc53301..b2f7e7c6ba54 100644
--- a/include/linux/stackdepot.h
+++ b/include/linux/stackdepot.h
@@ -15,6 +15,10 @@
 
 typedef u32 depot_stack_handle_t;
 
+depot_stack_handle_t __stack_depot_save(unsigned long *entries,
+					unsigned int nr_entries,
+					gfp_t gfp_flags, bool can_alloc);
+
 depot_stack_handle_t stack_depot_save(unsigned long *entries,
 				      unsigned int nr_entries, gfp_t gfp_flags);
 
-- 
cgit v1.2.3


From 7cb3007ce2da27ec02a1a3211941e7fe6875b642 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Fri, 5 Nov 2021 13:35:46 -0700
Subject: kasan: generic: introduce kasan_record_aux_stack_noalloc()

Introduce a variant of kasan_record_aux_stack() that does not do any
memory allocation through stackdepot.  This will permit using it in
contexts that cannot allocate any memory.

Link: https://lkml.kernel.org/r/20210913112609.2651084-6-elver@google.com
Signed-off-by: Marco Elver <elver@google.com>
Tested-by: Shuah Khan <skhan@linuxfoundation.org>
Acked-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Taras Madan <tarasmadan@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vijayanand Jitta <vjitta@codeaurora.org>
Cc: Vinayak Menon <vinmenon@codeaurora.org>
Cc: Walter Wu <walter-zh.wu@mediatek.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kasan.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index dd874a1ee862..736d7b458996 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -370,12 +370,14 @@ static inline void kasan_unpoison_task_stack(struct task_struct *task) {}
 void kasan_cache_shrink(struct kmem_cache *cache);
 void kasan_cache_shutdown(struct kmem_cache *cache);
 void kasan_record_aux_stack(void *ptr);
+void kasan_record_aux_stack_noalloc(void *ptr);
 
 #else /* CONFIG_KASAN_GENERIC */
 
 static inline void kasan_cache_shrink(struct kmem_cache *cache) {}
 static inline void kasan_cache_shutdown(struct kmem_cache *cache) {}
 static inline void kasan_record_aux_stack(void *ptr) {}
+static inline void kasan_record_aux_stack_noalloc(void *ptr) {}
 
 #endif /* CONFIG_KASAN_GENERIC */
 
-- 
cgit v1.2.3


From 86cffecdeaa278444870c8745ab166a65865dbf0 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 5 Nov 2021 13:36:19 -0700
Subject: Compiler Attributes: add __alloc_size() for better bounds checking

GCC and Clang can use the "alloc_size" attribute to better inform the
results of __builtin_object_size() (for compile-time constant values).
Clang can additionally use alloc_size to inform the results of
__builtin_dynamic_object_size() (for run-time values).

Because GCC sees the frequent use of struct_size() as an allocator size
argument, and notices it can return SIZE_MAX (the overflow indication),
it complains about these call sites overflowing (since SIZE_MAX is
greater than the default -Walloc-size-larger-than=PTRDIFF_MAX).  This
isn't helpful since we already know a SIZE_MAX will be caught at
run-time (this was an intentional design).  To deal with this, we must
disable this check as it is both a false positive and redundant.  (Clang
does not have this warning option.)

Unfortunately, just checking the -Wno-alloc-size-larger-than is not
sufficient to make the __alloc_size attribute behave correctly under
older GCC versions.  The attribute itself must be disabled in those
situations too, as there appears to be no way to reliably silence the
SIZE_MAX constant expression cases for GCC versions less than 9.1:

   In file included from ./include/linux/resource_ext.h:11,
                    from ./include/linux/pci.h:40,
                    from drivers/net/ethernet/intel/ixgbe/ixgbe.h:9,
                    from drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c:4:
   In function 'kmalloc_node',
       inlined from 'ixgbe_alloc_q_vector' at ./include/linux/slab.h:743:9:
   ./include/linux/slab.h:618:9: error: argument 1 value '18446744073709551615' exceeds maximum object size 9223372036854775807 [-Werror=alloc-size-larger-than=]
     return __kmalloc_node(size, flags, node);
            ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   ./include/linux/slab.h: In function 'ixgbe_alloc_q_vector':
   ./include/linux/slab.h:455:7: note: in a call to allocation function '__kmalloc_node' declared here
    void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_slab_alignment __malloc;
          ^~~~~~~~~~~~~~

Specifically:
 '-Wno-alloc-size-larger-than' is not correctly handled by GCC < 9.1
    https://godbolt.org/z/hqsfG7q84 (doesn't disable)
    https://godbolt.org/z/P9jdrPTYh (doesn't admit to not knowing about option)
    https://godbolt.org/z/465TPMWKb (only warns when other warnings appear)

 '-Walloc-size-larger-than=18446744073709551615' is not handled by GCC < 8.2
    https://godbolt.org/z/73hh1EPxz (ignores numeric value)

Since anything marked with __alloc_size would also qualify for marking
with __malloc, just include __malloc along with it to avoid redundant
markings.  (Suggested by Linus Torvalds.)

Finally, make sure checkpatch.pl doesn't get confused about finding the
__alloc_size attribute on functions.  (Thanks to Joe Perches.)

Link: https://lkml.kernel.org/r/20210930222704.2631604-3-keescook@chromium.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Andy Whitcroft <apw@canonical.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Daniel Micay <danielmicay@gmail.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Dwaipayan Ray <dwaipayanray1@gmail.com>
Cc: Joe Perches <joe@perches.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Alexandre Bounine <alex.bou9@gmail.com>
Cc: Gustavo A. R. Silva <gustavoars@kernel.org>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jing Xiangfeng <jingxiangfeng@huawei.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: kernel test robot <lkp@intel.com>
Cc: Matt Porter <mporter@kernel.crashing.org>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Souptick Joarder <jrdr.linux@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compiler-gcc.h        |  8 ++++++++
 include/linux/compiler_attributes.h | 10 ++++++++++
 include/linux/compiler_types.h      | 12 ++++++++++++
 3 files changed, 30 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index bd2b881c6b63..b9d5f9c373a0 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -144,3 +144,11 @@
 #else
 #define __diag_GCC_8(s)
 #endif
+
+/*
+ * Prior to 9.1, -Wno-alloc-size-larger-than (and therefore the "alloc_size"
+ * attribute) do not work, and must be disabled.
+ */
+#if GCC_VERSION < 90100
+#undef __alloc_size__
+#endif
diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h
index e6ec63403965..3de06a8fae73 100644
--- a/include/linux/compiler_attributes.h
+++ b/include/linux/compiler_attributes.h
@@ -33,6 +33,15 @@
 #define __aligned(x)                    __attribute__((__aligned__(x)))
 #define __aligned_largest               __attribute__((__aligned__))
 
+/*
+ * Note: do not use this directly. Instead, use __alloc_size() since it is conditionally
+ * available and includes other attributes.
+ *
+ *   gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-alloc_005fsize-function-attribute
+ * clang: https://clang.llvm.org/docs/AttributeReference.html#alloc-size
+ */
+#define __alloc_size__(x, ...)		__attribute__((__alloc_size__(x, ## __VA_ARGS__)))
+
 /*
  * Note: users of __always_inline currently do not write "inline" themselves,
  * which seems to be required by gcc to apply the attribute according
@@ -153,6 +162,7 @@
 
 /*
  *   gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-malloc-function-attribute
+ * clang: https://clang.llvm.org/docs/AttributeReference.html#malloc
  */
 #define __malloc                        __attribute__((__malloc__))
 
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index b6ff83a714ca..4f2203c4a257 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -250,6 +250,18 @@ struct ftrace_likely_data {
 # define __cficanonical
 #endif
 
+/*
+ * Any place that could be marked with the "alloc_size" attribute is also
+ * a place to be marked with the "malloc" attribute. Do this as part of the
+ * __alloc_size macro to avoid redundant attributes and to avoid missing a
+ * __malloc marking.
+ */
+#ifdef __alloc_size__
+# define __alloc_size(x, ...)	__alloc_size__(x, ## __VA_ARGS__) __malloc
+#else
+# define __alloc_size(x, ...)	__malloc
+#endif
+
 #ifndef asm_volatile_goto
 #define asm_volatile_goto(x...) asm goto(x)
 #endif
-- 
cgit v1.2.3


From 72d67229f522e3331d1eabd9f58d36ae080eb228 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 5 Nov 2021 13:36:23 -0700
Subject: slab: clean up function prototypes

Based on feedback from Joe Perches and Linus Torvalds, regularize the
slab function prototypes before making attribute changes.

Link: https://lkml.kernel.org/r/20210930222704.2631604-4-keescook@chromium.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Alexandre Bounine <alex.bou9@gmail.com>
Cc: Andy Whitcroft <apw@canonical.com>
Cc: Daniel Micay <danielmicay@gmail.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Dwaipayan Ray <dwaipayanray1@gmail.com>
Cc: Gustavo A. R. Silva <gustavoars@kernel.org>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jing Xiangfeng <jingxiangfeng@huawei.com>
Cc: Joe Perches <joe@perches.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: kernel test robot <lkp@intel.com>
Cc: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Cc: Matt Porter <mporter@kernel.crashing.org>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Souptick Joarder <jrdr.linux@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab.h | 68 ++++++++++++++++++++++++++--------------------------
 1 file changed, 34 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index c0d46b6fa12a..d05de03bdcdd 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -152,8 +152,8 @@ struct kmem_cache *kmem_cache_create_usercopy(const char *name,
 			slab_flags_t flags,
 			unsigned int useroffset, unsigned int usersize,
 			void (*ctor)(void *));
-void kmem_cache_destroy(struct kmem_cache *);
-int kmem_cache_shrink(struct kmem_cache *);
+void kmem_cache_destroy(struct kmem_cache *s);
+int kmem_cache_shrink(struct kmem_cache *s);
 
 /*
  * Please use this macro to create slab caches. Simply specify the
@@ -181,11 +181,11 @@ int kmem_cache_shrink(struct kmem_cache *);
 /*
  * Common kmalloc functions provided by all allocators
  */
-void * __must_check krealloc(const void *, size_t, gfp_t);
-void kfree(const void *);
-void kfree_sensitive(const void *);
-size_t __ksize(const void *);
-size_t ksize(const void *);
+void * __must_check krealloc(const void *objp, size_t new_size, gfp_t flags);
+void kfree(const void *objp);
+void kfree_sensitive(const void *objp);
+size_t __ksize(const void *objp);
+size_t ksize(const void *objp);
 #ifdef CONFIG_PRINTK
 bool kmem_valid_obj(void *object);
 void kmem_dump_obj(void *object);
@@ -426,8 +426,8 @@ static __always_inline unsigned int __kmalloc_index(size_t size,
 #endif /* !CONFIG_SLOB */
 
 void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc;
-void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment __malloc;
-void kmem_cache_free(struct kmem_cache *, void *);
+void *kmem_cache_alloc(struct kmem_cache *s, gfp_t flags) __assume_slab_alignment __malloc;
+void kmem_cache_free(struct kmem_cache *s, void *objp);
 
 /*
  * Bulk allocation and freeing operations. These are accelerated in an
@@ -436,8 +436,8 @@ void kmem_cache_free(struct kmem_cache *, void *);
  *
  * Note that interrupts must be enabled when calling these functions.
  */
-void kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
-int kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
+void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p);
+int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p);
 
 /*
  * Caller must not use kfree_bulk() on memory not originally allocated
@@ -450,7 +450,8 @@ static __always_inline void kfree_bulk(size_t size, void **p)
 
 #ifdef CONFIG_NUMA
 void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc;
-void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment __malloc;
+void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node) __assume_slab_alignment
+									 __malloc;
 #else
 static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node)
 {
@@ -464,25 +465,24 @@ static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t f
 #endif
 
 #ifdef CONFIG_TRACING
-extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t) __assume_slab_alignment __malloc;
+extern void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t flags, size_t size)
+				   __assume_slab_alignment __malloc;
 
 #ifdef CONFIG_NUMA
-extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
-					   gfp_t gfpflags,
-					   int node, size_t size) __assume_slab_alignment __malloc;
+extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags,
+					 int node, size_t size) __assume_slab_alignment __malloc;
 #else
-static __always_inline void *
-kmem_cache_alloc_node_trace(struct kmem_cache *s,
-			      gfp_t gfpflags,
-			      int node, size_t size)
+static __always_inline void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
+							 gfp_t gfpflags, int node,
+							 size_t size)
 {
 	return kmem_cache_alloc_trace(s, gfpflags, size);
 }
 #endif /* CONFIG_NUMA */
 
 #else /* CONFIG_TRACING */
-static __always_inline void *kmem_cache_alloc_trace(struct kmem_cache *s,
-		gfp_t flags, size_t size)
+static __always_inline void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t flags,
+						    size_t size)
 {
 	void *ret = kmem_cache_alloc(s, flags);
 
@@ -490,10 +490,8 @@ static __always_inline void *kmem_cache_alloc_trace(struct kmem_cache *s,
 	return ret;
 }
 
-static __always_inline void *
-kmem_cache_alloc_node_trace(struct kmem_cache *s,
-			      gfp_t gfpflags,
-			      int node, size_t size)
+static __always_inline void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags,
+							 int node, size_t size)
 {
 	void *ret = kmem_cache_alloc_node(s, gfpflags, node);
 
@@ -502,13 +500,14 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s,
 }
 #endif /* CONFIG_TRACING */
 
-extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment __malloc;
+extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment
+									 __malloc;
 
 #ifdef CONFIG_TRACING
-extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment __malloc;
+extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
+				__assume_page_alignment __malloc;
 #else
-static __always_inline void *
-kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
+static __always_inline void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
 {
 	return kmalloc_order(size, flags, order);
 }
@@ -638,8 +637,8 @@ static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags)
  * @new_size: new size of a single member of the array
  * @flags: the type of memory to allocate (see kmalloc)
  */
-static __must_check inline void *
-krealloc_array(void *p, size_t new_n, size_t new_size, gfp_t flags)
+static inline void * __must_check krealloc_array(void *p, size_t new_n, size_t new_size,
+						 gfp_t flags)
 {
 	size_t bytes;
 
@@ -668,7 +667,7 @@ static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
  * allocator where we care about the real place the memory allocation
  * request comes from.
  */
-extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long);
+extern void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller);
 #define kmalloc_track_caller(size, flags) \
 	__kmalloc_track_caller(size, flags, _RET_IP_)
 
@@ -691,7 +690,8 @@ static inline void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node)
 
 
 #ifdef CONFIG_NUMA
-extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long);
+extern void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node,
+					 unsigned long caller);
 #define kmalloc_node_track_caller(size, flags, node) \
 	__kmalloc_node_track_caller(size, flags, node, \
 			_RET_IP_)
-- 
cgit v1.2.3


From c37495d6254c237578db3121dcf79857e033f8ff Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 5 Nov 2021 13:36:27 -0700
Subject: slab: add __alloc_size attributes for better bounds checking

As already done in GrapheneOS, add the __alloc_size attribute for
regular kmalloc interfaces, to provide additional hinting for better
bounds checking, assisting CONFIG_FORTIFY_SOURCE and other compiler
optimizations.

Link: https://lkml.kernel.org/r/20210930222704.2631604-5-keescook@chromium.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Co-developed-by: Daniel Micay <danielmicay@gmail.com>
Signed-off-by: Daniel Micay <danielmicay@gmail.com>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Andy Whitcroft <apw@canonical.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Dwaipayan Ray <dwaipayanray1@gmail.com>
Cc: Joe Perches <joe@perches.com>
Cc: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Alexandre Bounine <alex.bou9@gmail.com>
Cc: Gustavo A. R. Silva <gustavoars@kernel.org>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jing Xiangfeng <jingxiangfeng@huawei.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: kernel test robot <lkp@intel.com>
Cc: Matt Porter <mporter@kernel.crashing.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Souptick Joarder <jrdr.linux@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab.h | 61 ++++++++++++++++++++++++++++------------------------
 1 file changed, 33 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index d05de03bdcdd..b5bf0537975b 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -181,7 +181,7 @@ int kmem_cache_shrink(struct kmem_cache *s);
 /*
  * Common kmalloc functions provided by all allocators
  */
-void * __must_check krealloc(const void *objp, size_t new_size, gfp_t flags);
+void * __must_check krealloc(const void *objp, size_t new_size, gfp_t flags) __alloc_size(2);
 void kfree(const void *objp);
 void kfree_sensitive(const void *objp);
 size_t __ksize(const void *objp);
@@ -425,7 +425,7 @@ static __always_inline unsigned int __kmalloc_index(size_t size,
 #define kmalloc_index(s) __kmalloc_index(s, true)
 #endif /* !CONFIG_SLOB */
 
-void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc;
+void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __alloc_size(1);
 void *kmem_cache_alloc(struct kmem_cache *s, gfp_t flags) __assume_slab_alignment __malloc;
 void kmem_cache_free(struct kmem_cache *s, void *objp);
 
@@ -449,11 +449,12 @@ static __always_inline void kfree_bulk(size_t size, void **p)
 }
 
 #ifdef CONFIG_NUMA
-void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc;
+void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment
+							 __alloc_size(1);
 void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node) __assume_slab_alignment
 									 __malloc;
 #else
-static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node)
+static __always_inline __alloc_size(1) void *__kmalloc_node(size_t size, gfp_t flags, int node)
 {
 	return __kmalloc(size, flags);
 }
@@ -466,23 +467,23 @@ static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t f
 
 #ifdef CONFIG_TRACING
 extern void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t flags, size_t size)
-				   __assume_slab_alignment __malloc;
+				   __assume_slab_alignment __alloc_size(3);
 
 #ifdef CONFIG_NUMA
 extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags,
-					 int node, size_t size) __assume_slab_alignment __malloc;
+					 int node, size_t size) __assume_slab_alignment
+								__alloc_size(4);
 #else
-static __always_inline void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
-							 gfp_t gfpflags, int node,
-							 size_t size)
+static __always_inline __alloc_size(4) void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
+						 gfp_t gfpflags, int node, size_t size)
 {
 	return kmem_cache_alloc_trace(s, gfpflags, size);
 }
 #endif /* CONFIG_NUMA */
 
 #else /* CONFIG_TRACING */
-static __always_inline void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t flags,
-						    size_t size)
+static __always_inline __alloc_size(3) void *kmem_cache_alloc_trace(struct kmem_cache *s,
+								    gfp_t flags, size_t size)
 {
 	void *ret = kmem_cache_alloc(s, flags);
 
@@ -501,19 +502,20 @@ static __always_inline void *kmem_cache_alloc_node_trace(struct kmem_cache *s, g
 #endif /* CONFIG_TRACING */
 
 extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment
-									 __malloc;
+									 __alloc_size(1);
 
 #ifdef CONFIG_TRACING
 extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
-				__assume_page_alignment __malloc;
+				__assume_page_alignment __alloc_size(1);
 #else
-static __always_inline void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
+static __always_inline __alloc_size(1) void *kmalloc_order_trace(size_t size, gfp_t flags,
+								 unsigned int order)
 {
 	return kmalloc_order(size, flags, order);
 }
 #endif
 
-static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
+static __always_inline __alloc_size(1) void *kmalloc_large(size_t size, gfp_t flags)
 {
 	unsigned int order = get_order(size);
 	return kmalloc_order_trace(size, flags, order);
@@ -573,7 +575,7 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
  *	Try really hard to succeed the allocation but fail
  *	eventually.
  */
-static __always_inline void *kmalloc(size_t size, gfp_t flags)
+static __always_inline __alloc_size(1) void *kmalloc(size_t size, gfp_t flags)
 {
 	if (__builtin_constant_p(size)) {
 #ifndef CONFIG_SLOB
@@ -595,7 +597,7 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags)
 	return __kmalloc(size, flags);
 }
 
-static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
+static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t flags, int node)
 {
 #ifndef CONFIG_SLOB
 	if (__builtin_constant_p(size) &&
@@ -619,7 +621,7 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
  * @size: element size.
  * @flags: the type of memory to allocate (see kmalloc).
  */
-static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags)
+static inline __alloc_size(1, 2) void *kmalloc_array(size_t n, size_t size, gfp_t flags)
 {
 	size_t bytes;
 
@@ -637,8 +639,10 @@ static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags)
  * @new_size: new size of a single member of the array
  * @flags: the type of memory to allocate (see kmalloc)
  */
-static inline void * __must_check krealloc_array(void *p, size_t new_n, size_t new_size,
-						 gfp_t flags)
+static inline __alloc_size(2, 3) void * __must_check krealloc_array(void *p,
+								    size_t new_n,
+								    size_t new_size,
+								    gfp_t flags)
 {
 	size_t bytes;
 
@@ -654,7 +658,7 @@ static inline void * __must_check krealloc_array(void *p, size_t new_n, size_t n
  * @size: element size.
  * @flags: the type of memory to allocate (see kmalloc).
  */
-static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
+static inline __alloc_size(1, 2) void *kcalloc(size_t n, size_t size, gfp_t flags)
 {
 	return kmalloc_array(n, size, flags | __GFP_ZERO);
 }
@@ -667,12 +671,13 @@ static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
  * allocator where we care about the real place the memory allocation
  * request comes from.
  */
-extern void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller);
+extern void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
+				   __alloc_size(1);
 #define kmalloc_track_caller(size, flags) \
 	__kmalloc_track_caller(size, flags, _RET_IP_)
 
-static inline void *kmalloc_array_node(size_t n, size_t size, gfp_t flags,
-				       int node)
+static inline __alloc_size(1, 2) void *kmalloc_array_node(size_t n, size_t size, gfp_t flags,
+							  int node)
 {
 	size_t bytes;
 
@@ -683,7 +688,7 @@ static inline void *kmalloc_array_node(size_t n, size_t size, gfp_t flags,
 	return __kmalloc_node(bytes, flags, node);
 }
 
-static inline void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node)
+static inline __alloc_size(1, 2) void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node)
 {
 	return kmalloc_array_node(n, size, flags | __GFP_ZERO, node);
 }
@@ -691,7 +696,7 @@ static inline void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node)
 
 #ifdef CONFIG_NUMA
 extern void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node,
-					 unsigned long caller);
+					 unsigned long caller) __alloc_size(1);
 #define kmalloc_node_track_caller(size, flags, node) \
 	__kmalloc_node_track_caller(size, flags, node, \
 			_RET_IP_)
@@ -716,7 +721,7 @@ static inline void *kmem_cache_zalloc(struct kmem_cache *k, gfp_t flags)
  * @size: how many bytes of memory are required.
  * @flags: the type of memory to allocate (see kmalloc).
  */
-static inline void *kzalloc(size_t size, gfp_t flags)
+static inline __alloc_size(1) void *kzalloc(size_t size, gfp_t flags)
 {
 	return kmalloc(size, flags | __GFP_ZERO);
 }
@@ -727,7 +732,7 @@ static inline void *kzalloc(size_t size, gfp_t flags)
  * @flags: the type of memory to allocate (see kmalloc).
  * @node: memory node from which to allocate
  */
-static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
+static inline __alloc_size(1) void *kzalloc_node(size_t size, gfp_t flags, int node)
 {
 	return kmalloc_node(size, flags | __GFP_ZERO, node);
 }
-- 
cgit v1.2.3


From 56bcf40f91c7d7f53b77abe161a7d18ef9f981ba Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 5 Nov 2021 13:36:31 -0700
Subject: mm/kvmalloc: add __alloc_size attributes for better bounds checking

As already done in GrapheneOS, add the __alloc_size attribute for
regular kvmalloc interfaces, to provide additional hinting for better
bounds checking, assisting CONFIG_FORTIFY_SOURCE and other compiler
optimizations.

Link: https://lkml.kernel.org/r/20210930222704.2631604-6-keescook@chromium.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Co-developed-by: Daniel Micay <danielmicay@gmail.com>
Signed-off-by: Daniel Micay <danielmicay@gmail.com>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Andy Whitcroft <apw@canonical.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Dwaipayan Ray <dwaipayanray1@gmail.com>
Cc: Joe Perches <joe@perches.com>
Cc: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Alexandre Bounine <alex.bou9@gmail.com>
Cc: Gustavo A. R. Silva <gustavoars@kernel.org>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jing Xiangfeng <jingxiangfeng@huawei.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: kernel test robot <lkp@intel.com>
Cc: Matt Porter <mporter@kernel.crashing.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Souptick Joarder <jrdr.linux@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index b5bf0537975b..837cb16232ef 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -737,21 +737,21 @@ static inline __alloc_size(1) void *kzalloc_node(size_t size, gfp_t flags, int n
 	return kmalloc_node(size, flags | __GFP_ZERO, node);
 }
 
-extern void *kvmalloc_node(size_t size, gfp_t flags, int node);
-static inline void *kvmalloc(size_t size, gfp_t flags)
+extern void *kvmalloc_node(size_t size, gfp_t flags, int node) __alloc_size(1);
+static inline __alloc_size(1) void *kvmalloc(size_t size, gfp_t flags)
 {
 	return kvmalloc_node(size, flags, NUMA_NO_NODE);
 }
-static inline void *kvzalloc_node(size_t size, gfp_t flags, int node)
+static inline __alloc_size(1) void *kvzalloc_node(size_t size, gfp_t flags, int node)
 {
 	return kvmalloc_node(size, flags | __GFP_ZERO, node);
 }
-static inline void *kvzalloc(size_t size, gfp_t flags)
+static inline __alloc_size(1) void *kvzalloc(size_t size, gfp_t flags)
 {
 	return kvmalloc(size, flags | __GFP_ZERO);
 }
 
-static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags)
+static inline __alloc_size(1, 2) void *kvmalloc_array(size_t n, size_t size, gfp_t flags)
 {
 	size_t bytes;
 
@@ -761,13 +761,13 @@ static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags)
 	return kvmalloc(bytes, flags);
 }
 
-static inline void *kvcalloc(size_t n, size_t size, gfp_t flags)
+static inline __alloc_size(1, 2) void *kvcalloc(size_t n, size_t size, gfp_t flags)
 {
 	return kvmalloc_array(n, size, flags | __GFP_ZERO);
 }
 
-extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize,
-		gfp_t flags);
+extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
+		      __alloc_size(3);
 extern void kvfree(const void *addr);
 extern void kvfree_sensitive(const void *addr, size_t len);
 
-- 
cgit v1.2.3


From 894f24bb569afd4fe4a874c636f82d47f1c9beed Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 5 Nov 2021 13:36:34 -0700
Subject: mm/vmalloc: add __alloc_size attributes for better bounds checking

As already done in GrapheneOS, add the __alloc_size attribute for
appropriate vmalloc allocator interfaces, to provide additional hinting
for better bounds checking, assisting CONFIG_FORTIFY_SOURCE and other
compiler optimizations.

Link: https://lkml.kernel.org/r/20210930222704.2631604-7-keescook@chromium.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Co-developed-by: Daniel Micay <danielmicay@gmail.com>
Signed-off-by: Daniel Micay <danielmicay@gmail.com>
Cc: Andy Whitcroft <apw@canonical.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Dwaipayan Ray <dwaipayanray1@gmail.com>
Cc: Joe Perches <joe@perches.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Alexandre Bounine <alex.bou9@gmail.com>
Cc: Gustavo A. R. Silva <gustavoars@kernel.org>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jing Xiangfeng <jingxiangfeng@huawei.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: kernel test robot <lkp@intel.com>
Cc: Matt Porter <mporter@kernel.crashing.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Souptick Joarder <jrdr.linux@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vmalloc.h | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 671d402c3778..0ed56fc10c11 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -136,21 +136,21 @@ static inline void vmalloc_init(void)
 static inline unsigned long vmalloc_nr_pages(void) { return 0; }
 #endif
 
-extern void *vmalloc(unsigned long size);
-extern void *vzalloc(unsigned long size);
-extern void *vmalloc_user(unsigned long size);
-extern void *vmalloc_node(unsigned long size, int node);
-extern void *vzalloc_node(unsigned long size, int node);
-extern void *vmalloc_32(unsigned long size);
-extern void *vmalloc_32_user(unsigned long size);
-extern void *__vmalloc(unsigned long size, gfp_t gfp_mask);
+extern void *vmalloc(unsigned long size) __alloc_size(1);
+extern void *vzalloc(unsigned long size) __alloc_size(1);
+extern void *vmalloc_user(unsigned long size) __alloc_size(1);
+extern void *vmalloc_node(unsigned long size, int node) __alloc_size(1);
+extern void *vzalloc_node(unsigned long size, int node) __alloc_size(1);
+extern void *vmalloc_32(unsigned long size) __alloc_size(1);
+extern void *vmalloc_32_user(unsigned long size) __alloc_size(1);
+extern void *__vmalloc(unsigned long size, gfp_t gfp_mask) __alloc_size(1);
 extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
 			unsigned long start, unsigned long end, gfp_t gfp_mask,
 			pgprot_t prot, unsigned long vm_flags, int node,
-			const void *caller);
+			const void *caller) __alloc_size(1);
 void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
-		int node, const void *caller);
-void *vmalloc_no_huge(unsigned long size);
+		int node, const void *caller) __alloc_size(1);
+void *vmalloc_no_huge(unsigned long size) __alloc_size(1);
 
 extern void vfree(const void *addr);
 extern void vfree_atomic(const void *addr);
-- 
cgit v1.2.3


From abd58f38dfb4771ef06e25d71a84aa0932c52f1a Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 5 Nov 2021 13:36:38 -0700
Subject: mm/page_alloc: add __alloc_size attributes for better bounds checking

As already done in GrapheneOS, add the __alloc_size attribute for
appropriate page allocator interfaces, to provide additional hinting for
better bounds checking, assisting CONFIG_FORTIFY_SOURCE and other
compiler optimizations.

Link: https://lkml.kernel.org/r/20210930222704.2631604-8-keescook@chromium.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Co-developed-by: Daniel Micay <danielmicay@gmail.com>
Signed-off-by: Daniel Micay <danielmicay@gmail.com>
Cc: Andy Whitcroft <apw@canonical.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Dwaipayan Ray <dwaipayanray1@gmail.com>
Cc: Joe Perches <joe@perches.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Alexandre Bounine <alex.bou9@gmail.com>
Cc: Gustavo A. R. Silva <gustavoars@kernel.org>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jing Xiangfeng <jingxiangfeng@huawei.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: kernel test robot <lkp@intel.com>
Cc: Matt Porter <mporter@kernel.crashing.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Souptick Joarder <jrdr.linux@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 55b2ec1f965a..fbd4abc33f24 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -608,9 +608,9 @@ static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)
 extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
 extern unsigned long get_zeroed_page(gfp_t gfp_mask);
 
-void *alloc_pages_exact(size_t size, gfp_t gfp_mask);
+void *alloc_pages_exact(size_t size, gfp_t gfp_mask) __alloc_size(1);
 void free_pages_exact(void *virt, size_t size);
-void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);
+__meminit void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) __alloc_size(1);
 
 #define __get_free_page(gfp_mask) \
 		__get_free_pages((gfp_mask), 0)
-- 
cgit v1.2.3


From 17197dd460469c6fca66e274f5cd51ee43bb6ddd Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 5 Nov 2021 13:36:42 -0700
Subject: percpu: add __alloc_size attributes for better bounds checking

As already done in GrapheneOS, add the __alloc_size attribute for
appropriate percpu allocator interfaces, to provide additional hinting
for better bounds checking, assisting CONFIG_FORTIFY_SOURCE and other
compiler optimizations.

Note that due to the implementation of the percpu API, this is unlikely
to ever actually provide compile-time checking beyond very simple
non-SMP builds.  But, since they are technically allocators, mark them
as such.

Link: https://lkml.kernel.org/r/20210930222704.2631604-9-keescook@chromium.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Co-developed-by: Daniel Micay <danielmicay@gmail.com>
Signed-off-by: Daniel Micay <danielmicay@gmail.com>
Acked-by: Dennis Zhou <dennis@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Andy Whitcroft <apw@canonical.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Dwaipayan Ray <dwaipayanray1@gmail.com>
Cc: Joe Perches <joe@perches.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Alexandre Bounine <alex.bou9@gmail.com>
Cc: Gustavo A. R. Silva <gustavoars@kernel.org>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jing Xiangfeng <jingxiangfeng@huawei.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: kernel test robot <lkp@intel.com>
Cc: Matt Porter <mporter@kernel.crashing.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Souptick Joarder <jrdr.linux@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/percpu.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 5e76af742c80..98a9371133f8 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -123,7 +123,7 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size,
 				pcpu_fc_populate_pte_fn_t populate_pte_fn);
 #endif
 
-extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align);
+extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align) __alloc_size(1);
 extern bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr);
 extern bool is_kernel_percpu_address(unsigned long addr);
 
@@ -131,8 +131,8 @@ extern bool is_kernel_percpu_address(unsigned long addr);
 extern void __init setup_per_cpu_areas(void);
 #endif
 
-extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp);
-extern void __percpu *__alloc_percpu(size_t size, size_t align);
+extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) __alloc_size(1);
+extern void __percpu *__alloc_percpu(size_t size, size_t align) __alloc_size(1);
 extern void free_percpu(void __percpu *__pdata);
 extern phys_addr_t per_cpu_ptr_to_phys(void *addr);
 
-- 
cgit v1.2.3


From 0b3ea0926afb8dde70cfab00316ae0a70b93a7cc Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 5 Nov 2021 13:36:58 -0700
Subject: fs: explicitly unregister per-superblock BDIs

Add a new SB_I_ flag to mark superblocks that have an ephemeral bdi
associated with them, and unregister it when the superblock is shut
down.

Link: https://lkml.kernel.org/r/20211021124441.668816-4-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Miquel Raynal <miquel.raynal@bootlin.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Vignesh Raghavendra <vigneshr@ti.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/fs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index e7a633353fd2..226de651f52e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1443,6 +1443,7 @@ extern int send_sigurg(struct fown_struct *fown);
 #define SB_I_UNTRUSTED_MOUNTER		0x00000040
 
 #define SB_I_SKIP_SYNC	0x00000100	/* Skip superblock at global sync */
+#define SB_I_PERSB_BDI	0x00000200	/* has a per-sb bdi */
 
 /* Possible states of 'frozen' field */
 enum {
-- 
cgit v1.2.3


From efee17134ca464639a2f5b4d036ce40caf1b247a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 5 Nov 2021 13:37:04 -0700
Subject: mm: simplify bdi refcounting

Move grabbing and releasing the bdi refcount out of the common
wb_init/wb_exit helpers into code that is only used for the non-default
memcg driven bdi_writeback structures.

[hch@lst.de: add comment]
  Link: https://lkml.kernel.org/r/20211027074207.GA12793@lst.de
[akpm@linux-foundation.org: fix typo]

Link: https://lkml.kernel.org/r/20211021124441.668816-6-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Miquel Raynal <miquel.raynal@bootlin.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Vignesh Raghavendra <vigneshr@ti.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/backing-dev-defs.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 33207004cfde..993c5628a726 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -103,6 +103,9 @@ struct wb_completion {
  * change as blkcg is disabled and enabled higher up in the hierarchy, a wb
  * is tested for blkcg after lookup and removed from index on mismatch so
  * that a new wb for the combination can be created.
+ *
+ * Each bdi_writeback that is not embedded into the backing_dev_info must hold
+ * a reference to the parent backing_dev_info.  See cgwb_create() for details.
  */
 struct bdi_writeback {
 	struct backing_dev_info *bdi;	/* our parent bdi */
-- 
cgit v1.2.3


From e80216d9f1f5c90b3cab834cd4fb492d731f70aa Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Fri, 5 Nov 2021 13:37:56 -0700
Subject: mm: memcontrol: remove the kmem states

Now the kmem states is only used to indicate whether the kmem is
offline.  However, we can set ->kmemcg_id to -1 to indicate whether the
kmem is offline.  Finally, we can remove the kmem states to simplify the
code.

Link: https://lkml.kernel.org/r/20211025125259.56624-1-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 3096c9a0ee01..f207f98bdb76 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -180,12 +180,6 @@ struct mem_cgroup_thresholds {
 	struct mem_cgroup_threshold_ary *spare;
 };
 
-enum memcg_kmem_state {
-	KMEM_NONE,
-	KMEM_ALLOCATED,
-	KMEM_ONLINE,
-};
-
 #if defined(CONFIG_SMP)
 struct memcg_padding {
 	char x[0];
@@ -318,7 +312,6 @@ struct mem_cgroup {
 
 #ifdef CONFIG_MEMCG_KMEM
 	int kmemcg_id;
-	enum memcg_kmem_state kmem_state;
 	struct obj_cgroup __rcu *objcg;
 	struct list_head objcg_list; /* list of inherited objcgs */
 #endif
-- 
cgit v1.2.3


From f1dc0db296bd25960273649fc6ef2ecbf5aaa0e0 Mon Sep 17 00:00:00 2001
From: Rolf Eike Beer <eb@emlix.com>
Date: Fri, 5 Nov 2021 13:38:15 -0700
Subject: mm: use __pfn_to_section() instead of open coding it

It is defined in the same file just a few lines above.

Link: https://lkml.kernel.org/r/4598487.Rc0NezkW7i@mobilepool36.emlix.com
Signed-off-by: Rolf Eike Beer <eb@emlix.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 6a1d79d84675..832aa49d0d8e 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1481,7 +1481,7 @@ static inline int pfn_valid(unsigned long pfn)
 
 	if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
 		return 0;
-	ms = __nr_to_section(pfn_to_section_nr(pfn));
+	ms = __pfn_to_section(pfn);
 	if (!valid_section(ms))
 		return 0;
 	/*
@@ -1496,7 +1496,7 @@ static inline int pfn_in_present_section(unsigned long pfn)
 {
 	if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
 		return 0;
-	return present_section(__nr_to_section(pfn_to_section_nr(pfn)));
+	return present_section(__pfn_to_section(pfn));
 }
 
 static inline unsigned long next_present_section_nr(unsigned long section_nr)
-- 
cgit v1.2.3


From 232a6a1c0619d1b4d9cd8d21949b2f13821be0af Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Fri, 5 Nov 2021 13:38:31 -0700
Subject: mm: drop first_index/last_index in zap_details

The first_index/last_index parameters in zap_details are actually only
used in unmap_mapping_range_tree().  At the meantime, this function is
only called by unmap_mapping_pages() once.

Instead of passing these two variables through the whole stack of page
zapping code, remove them from zap_details and let them simply be
parameters of unmap_mapping_range_tree(), which is inlined.

Link: https://lkml.kernel.org/r/20210915181535.11238-1-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Alistair Popple <apopple@nvidia.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Liam Howlett <liam.howlett@oracle.com>
Acked-by: Hugh Dickins <hughd@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: "Kirill A . Shutemov" <kirill@shutemov.name>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 32b2ecc47408..c5b600d7f85b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1688,8 +1688,6 @@ extern void user_shm_unlock(size_t, struct ucounts *);
  */
 struct zap_details {
 	struct address_space *check_mapping;	/* Check page->mapping if set */
-	pgoff_t	first_index;			/* Lowest page->index to unmap */
-	pgoff_t last_index;			/* Highest page->index to unmap */
 	struct page *single_page;		/* Locked page to be unmapped */
 };
 
-- 
cgit v1.2.3


From 91b61ef333cf43f96b3522a086c9ac925763d6e5 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Fri, 5 Nov 2021 13:38:34 -0700
Subject: mm: add zap_skip_check_mapping() helper

Use the helper for the checks.  Rename "check_mapping" into
"zap_mapping" because "check_mapping" looks like a bool but in fact it
stores the mapping itself.  When it's set, we check the mapping (it must
be non-NULL).  When it's cleared we skip the check, which works like the
old way.

Move the duplicated comments to the helper too.

Link: https://lkml.kernel.org/r/20210915181538.11288-1-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Alistair Popple <apopple@nvidia.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: "Kirill A . Shutemov" <kirill@shutemov.name>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c5b600d7f85b..6e29deb1e0d4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1687,10 +1687,24 @@ extern void user_shm_unlock(size_t, struct ucounts *);
  * Parameter block passed down to zap_pte_range in exceptional cases.
  */
 struct zap_details {
-	struct address_space *check_mapping;	/* Check page->mapping if set */
+	struct address_space *zap_mapping;	/* Check page->mapping if set */
 	struct page *single_page;		/* Locked page to be unmapped */
 };
 
+/*
+ * We set details->zap_mappings when we want to unmap shared but keep private
+ * pages. Return true if skip zapping this page, false otherwise.
+ */
+static inline bool
+zap_skip_check_mapping(struct zap_details *details, struct page *page)
+{
+	if (!details || !page)
+		return false;
+
+	return details->zap_mapping &&
+	    (details->zap_mapping != page_rmapping(page));
+}
+
 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 			     pte_t pte);
 struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
-- 
cgit v1.2.3


From e26e0cc30b48cad5f2704f6a22fa5cac9fdaaece Mon Sep 17 00:00:00 2001
From: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Date: Fri, 5 Nov 2021 13:39:00 -0700
Subject: memory: remove unused CONFIG_MEM_BLOCK_SIZE

Commit 3947be1969a9 ("[PATCH] memory hotplug: sysfs and add/remove
functions") defines CONFIG_MEM_BLOCK_SIZE, but this has never been
utilized anywhere.

It is a good practice to keep the CONFIG_* defines exclusively for the
Kbuild system.  So, drop this unused definition.

This issue was noticed due to running ./scripts/checkkconfigsymbols.py.

Link: https://lkml.kernel.org/r/20211006120354.7468-1-lukas.bulwahn@gmail.com
Signed-off-by: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memory.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/memory.h b/include/linux/memory.h
index 182c606adb06..053a530c7bdd 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -140,7 +140,6 @@ typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *);
 extern int walk_memory_blocks(unsigned long start, unsigned long size,
 			      void *arg, walk_memory_blocks_func_t func);
 extern int for_each_memory_block(void *arg, walk_memory_blocks_func_t func);
-#define CONFIG_MEM_BLOCK_SIZE	(PAGES_PER_SECTION<<PAGE_SHIFT)
 
 extern int memory_group_register_static(int nid, unsigned long max_pages);
 extern int memory_group_register_dynamic(int nid, unsigned long unit_pages);
-- 
cgit v1.2.3


From 2e86f78b117a019881a420705a9d0ef126253083 Mon Sep 17 00:00:00 2001
From: Lucas De Marchi <lucas.demarchi@intel.com>
Date: Fri, 5 Nov 2021 13:39:10 -0700
Subject: include/linux/io-mapping.h: remove fallback for writecombine

The fallback was introduced in commit 80c33624e472 ("io-mapping: Fixup
for different names of writecombine") to fix the build on microblaze.

5 years later, it seems all archs now provide a pgprot_writecombine(),
so just remove the other possible fallbacks.  For microblaze,
pgprot_writecombine() is available since commit 97ccedd793ac
("microblaze: Provide pgprot_device/writecombine macros for nommu").

This is build-tested on microblaze with a hack to always build
mm/io-mapping.o and without DIYing on an x86-only macro
(_PAGE_CACHE_MASK)

Link: https://lkml.kernel.org/r/20211020204838.1142908-1-lucas.demarchi@intel.com
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/io-mapping.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h
index e9743cfd8585..66a774d2710e 100644
--- a/include/linux/io-mapping.h
+++ b/include/linux/io-mapping.h
@@ -132,13 +132,7 @@ io_mapping_init_wc(struct io_mapping *iomap,
 
 	iomap->base = base;
 	iomap->size = size;
-#if defined(pgprot_noncached_wc) /* archs can't agree on a name ... */
-	iomap->prot = pgprot_noncached_wc(PAGE_KERNEL);
-#elif defined(pgprot_writecombine)
 	iomap->prot = pgprot_writecombine(PAGE_KERNEL);
-#else
-	iomap->prot = pgprot_noncached(PAGE_KERNEL);
-#endif
 
 	return iomap;
 }
-- 
cgit v1.2.3


From bd1a8fb2d43f7c293383f76691d7a55f7f89d9da Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 5 Nov 2021 13:39:22 -0700
Subject: mm/vmalloc: don't allow VM_NO_GUARD on vmap()

The vmalloc guard pages are added on top of each allocation, thereby
isolating any two allocations from one another.  The top guard of the
lower allocation is the bottom guard guard of the higher allocation etc.

Therefore VM_NO_GUARD is dangerous; it breaks the basic premise of
isolating separate allocations.

There are only two in-tree users of this flag, neither of which use it
through the exported interface.  Ensure it stays this way.

Link: https://lkml.kernel.org/r/YUMfdA36fuyZ+/xt@hirez.programming.kicks-ass.net
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Hildenbrand <david@redhat.com>
Acked-by: Will Deacon <will@kernel.org>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Uladzislau Rezki <urezki@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vmalloc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 0ed56fc10c11..6e022cc712e6 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -22,7 +22,7 @@ struct notifier_block;		/* in notifier.h */
 #define VM_USERMAP		0x00000008	/* suitable for remap_vmalloc_range */
 #define VM_DMA_COHERENT		0x00000010	/* dma_alloc_coherent */
 #define VM_UNINITIALIZED	0x00000020	/* vm_struct is not fully initialized */
-#define VM_NO_GUARD		0x00000040      /* don't add guard page */
+#define VM_NO_GUARD		0x00000040      /* ***DANGEROUS*** don't add guard page */
 #define VM_KASAN		0x00000080      /* has allocated kasan shadow memory */
 #define VM_FLUSH_RESET_PERMS	0x00000100	/* reset direct map and flush TLB on unmap, can't be freed in atomic context */
 #define VM_MAP_PUT_PAGES	0x00000200	/* put pages and free array in vfree */
-- 
cgit v1.2.3


From 3252b1d8309ea42bc6329d9341072ecf1c9505c0 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Fri, 5 Nov 2021 13:39:47 -0700
Subject: kasan: arm64: fix pcpu_page_first_chunk crash with KASAN_VMALLOC

With KASAN_VMALLOC and NEED_PER_CPU_PAGE_FIRST_CHUNK the kernel crashes:

  Unable to handle kernel paging request at virtual address ffff7000028f2000
  ...
  swapper pgtable: 64k pages, 48-bit VAs, pgdp=0000000042440000
  [ffff7000028f2000] pgd=000000063e7c0003, p4d=000000063e7c0003, pud=000000063e7c0003, pmd=000000063e7b0003, pte=0000000000000000
  Internal error: Oops: 96000007 [#1] PREEMPT SMP
  Modules linked in:
  CPU: 0 PID: 0 Comm: swapper Not tainted 5.13.0-rc4-00003-gc6e6e28f3f30-dirty #62
  Hardware name: linux,dummy-virt (DT)
  pstate: 200000c5 (nzCv daIF -PAN -UAO -TCO BTYPE=--)
  pc : kasan_check_range+0x90/0x1a0
  lr : memcpy+0x88/0xf4
  sp : ffff80001378fe20
  ...
  Call trace:
   kasan_check_range+0x90/0x1a0
   pcpu_page_first_chunk+0x3f0/0x568
   setup_per_cpu_areas+0xb8/0x184
   start_kernel+0x8c/0x328

The vm area used in vm_area_register_early() has no kasan shadow memory,
Let's add a new kasan_populate_early_vm_area_shadow() function to
populate the vm area shadow memory to fix the issue.

[wangkefeng.wang@huawei.com: fix redefinition of 'kasan_populate_early_vm_area_shadow']
  Link: https://lkml.kernel.org/r/20211011123211.3936196-1-wangkefeng.wang@huawei.com

Link: https://lkml.kernel.org/r/20210910053354.26721-4-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Acked-by: Marco Elver <elver@google.com>		[KASAN]
Acked-by: Andrey Konovalov <andreyknvl@gmail.com>	[KASAN]
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kasan.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 736d7b458996..65487a3b2a71 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -436,6 +436,8 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end,
 			   unsigned long free_region_start,
 			   unsigned long free_region_end);
 
+void kasan_populate_early_vm_area_shadow(void *start, unsigned long size);
+
 #else /* CONFIG_KASAN_VMALLOC */
 
 static inline int kasan_populate_vmalloc(unsigned long start,
@@ -453,6 +455,10 @@ static inline void kasan_release_vmalloc(unsigned long start,
 					 unsigned long free_region_start,
 					 unsigned long free_region_end) {}
 
+static inline void kasan_populate_early_vm_area_shadow(void *start,
+						       unsigned long size)
+{ }
+
 #endif /* CONFIG_KASAN_VMALLOC */
 
 #if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \
-- 
cgit v1.2.3


From c00b6b9610991c042ff4c3153daaa3ea8522c210 Mon Sep 17 00:00:00 2001
From: Chen Wandun <chenwandun@huawei.com>
Date: Fri, 5 Nov 2021 13:39:53 -0700
Subject: mm/vmalloc: introduce alloc_pages_bulk_array_mempolicy to accelerate
 memory allocation

Commit ffb29b1c255a ("mm/vmalloc: fix numa spreading for large hash
tables") can cause significant performance regressions in some
situations as Andrew mentioned in [1].  The main situation is vmalloc,
vmalloc will allocate pages with NUMA_NO_NODE by default, that will
result in alloc page one by one;

In order to solve this, __alloc_pages_bulk and mempolicy should be
considered at the same time.

1) If node is specified in memory allocation request, it will alloc all
   pages by __alloc_pages_bulk.

2) If interleaving allocate memory, it will cauculate how many pages
   should be allocated in each node, and use __alloc_pages_bulk to alloc
   pages in each node.

[1]: https://lore.kernel.org/lkml/CALvZod4G3SzP3kWxQYn0fj+VgG-G3yWXz=gz17+3N57ru1iajw@mail.gmail.com/t/#m750c8e3231206134293b089feaa090590afa0f60

[akpm@linux-foundation.org: coding style fixes]
[akpm@linux-foundation.org: make two functions static]
[akpm@linux-foundation.org: fix CONFIG_NUMA=n build]

Link: https://lkml.kernel.org/r/20211021080744.874701-3-chenwandun@huawei.com
Signed-off-by: Chen Wandun <chenwandun@huawei.com>
Reviewed-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index fbd4abc33f24..c1b262725dca 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -535,6 +535,10 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 				struct list_head *page_list,
 				struct page **page_array);
 
+unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
+				unsigned long nr_pages,
+				struct page **page_array);
+
 /* Bulk allocate order-0 pages */
 static inline unsigned long
 alloc_pages_bulk_list(gfp_t gfp, unsigned long nr_pages, struct list_head *list)
-- 
cgit v1.2.3


From 8ca1b5a49885f0c0c486544da46a9e0ac790831d Mon Sep 17 00:00:00 2001
From: Feng Tang <feng.tang@intel.com>
Date: Fri, 5 Nov 2021 13:40:34 -0700
Subject: mm/page_alloc: detect allocation forbidden by cpuset and bail out
 early

There was a report that starting an Ubuntu in docker while using cpuset
to bind it to movable nodes (a node only has movable zone, like a node
for hotplug or a Persistent Memory node in normal usage) will fail due
to memory allocation failure, and then OOM is involved and many other
innocent processes got killed.

It can be reproduced with command:

    $ docker run -it --rm --cpuset-mems 4 ubuntu:latest bash -c "grep Mems_allowed /proc/self/status"

(where node 4 is a movable node)

  runc:[2:INIT] invoked oom-killer: gfp_mask=0x500cc2(GFP_HIGHUSER|__GFP_ACCOUNT), order=0, oom_score_adj=0
  CPU: 8 PID: 8291 Comm: runc:[2:INIT] Tainted: G        W I E     5.8.2-0.g71b519a-default #1 openSUSE Tumbleweed (unreleased)
  Hardware name: Dell Inc. PowerEdge R640/0PHYDR, BIOS 2.6.4 04/09/2020
  Call Trace:
   dump_stack+0x6b/0x88
   dump_header+0x4a/0x1e2
   oom_kill_process.cold+0xb/0x10
   out_of_memory.part.0+0xaf/0x230
   out_of_memory+0x3d/0x80
   __alloc_pages_slowpath.constprop.0+0x954/0xa20
   __alloc_pages_nodemask+0x2d3/0x300
   pipe_write+0x322/0x590
   new_sync_write+0x196/0x1b0
   vfs_write+0x1c3/0x1f0
   ksys_write+0xa7/0xe0
   do_syscall_64+0x52/0xd0
   entry_SYSCALL_64_after_hwframe+0x44/0xa9

  Mem-Info:
  active_anon:392832 inactive_anon:182 isolated_anon:0
   active_file:68130 inactive_file:151527 isolated_file:0
   unevictable:2701 dirty:0 writeback:7
   slab_reclaimable:51418 slab_unreclaimable:116300
   mapped:45825 shmem:735 pagetables:2540 bounce:0
   free:159849484 free_pcp:73 free_cma:0
  Node 4 active_anon:1448kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:0kB dirty:0kB writeback:0kB shmem:0kB shmem_thp: 0kB shmem_pmdmapped: 0kB anon_thp: 0kB writeback_tmp:0kB all_unreclaimable? no
  Node 4 Movable free:130021408kB min:9140kB low:139160kB high:269180kB reserved_highatomic:0KB active_anon:1448kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:130023424kB managed:130023424kB mlocked:0kB kernel_stack:0kB pagetables:0kB bounce:0kB free_pcp:292kB local_pcp:84kB free_cma:0kB
  lowmem_reserve[]: 0 0 0 0 0
  Node 4 Movable: 1*4kB (M) 0*8kB 0*16kB 1*32kB (M) 0*64kB 0*128kB 1*256kB (M) 1*512kB (M) 1*1024kB (M) 0*2048kB 31743*4096kB (M) = 130021156kB

  oom-kill:constraint=CONSTRAINT_CPUSET,nodemask=(null),cpuset=docker-9976a269caec812c134fa317f27487ee36e1129beba7278a463dd53e5fb9997b.scope,mems_allowed=4,global_oom,task_memcg=/system.slice/containerd.service,task=containerd,pid=4100,uid=0
  Out of memory: Killed process 4100 (containerd) total-vm:4077036kB, anon-rss:51184kB, file-rss:26016kB, shmem-rss:0kB, UID:0 pgtables:676kB oom_score_adj:0
  oom_reaper: reaped process 8248 (docker), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
  oom_reaper: reaped process 2054 (node_exporter), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
  oom_reaper: reaped process 1452 (systemd-journal), now anon-rss:0kB, file-rss:8564kB, shmem-rss:4kB
  oom_reaper: reaped process 2146 (munin-node), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
  oom_reaper: reaped process 8291 (runc:[2:INIT]), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB

The reason is that in this case, the target cpuset nodes only have
movable zone, while the creation of an OS in docker sometimes needs to
allocate memory in non-movable zones (dma/dma32/normal) like
GFP_HIGHUSER, and the cpuset limit forbids the allocation, then
out-of-memory killing is involved even when normal nodes and movable
nodes both have many free memory.

The OOM killer cannot help to resolve the situation as there is no
usable memory for the request in the cpuset scope.  The only reasonable
measure to take is to fail the allocation right away and have the caller
to deal with it.

So add a check for cases like this in the slowpath of allocation, and
bail out early returning NULL for the allocation.

As page allocation is one of the hottest path in kernel, this check will
hurt all users with sane cpuset configuration, add a static branch check
and detect the abnormal config in cpuset memory binding setup so that
the extra check cost in page allocation is not paid by everyone.

[thanks to Micho Hocko and David Rientjes for suggesting not handling
 it inside OOM code, adding cpuset check, refining comments]

Link: https://lkml.kernel.org/r/1632481657-68112-1-git-send-email-feng.tang@intel.com
Signed-off-by: Feng Tang <feng.tang@intel.com>
Suggested-by: Michal Hocko <mhocko@suse.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Zefan Li <lizefan.x@bytedance.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cpuset.h | 17 +++++++++++++++++
 include/linux/mmzone.h | 22 ++++++++++++++++++++++
 2 files changed, 39 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index d2b9c41c8edf..d58e0476ee8e 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -34,6 +34,8 @@
  */
 extern struct static_key_false cpusets_pre_enable_key;
 extern struct static_key_false cpusets_enabled_key;
+extern struct static_key_false cpusets_insane_config_key;
+
 static inline bool cpusets_enabled(void)
 {
 	return static_branch_unlikely(&cpusets_enabled_key);
@@ -51,6 +53,19 @@ static inline void cpuset_dec(void)
 	static_branch_dec_cpuslocked(&cpusets_pre_enable_key);
 }
 
+/*
+ * This will get enabled whenever a cpuset configuration is considered
+ * unsupportable in general. E.g. movable only node which cannot satisfy
+ * any non movable allocations (see update_nodemask). Page allocator
+ * needs to make additional checks for those configurations and this
+ * check is meant to guard those checks without any overhead for sane
+ * configurations.
+ */
+static inline bool cpusets_insane_config(void)
+{
+	return static_branch_unlikely(&cpusets_insane_config_key);
+}
+
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
 extern void cpuset_force_rebuild(void);
@@ -167,6 +182,8 @@ static inline void set_mems_allowed(nodemask_t nodemask)
 
 static inline bool cpusets_enabled(void) { return false; }
 
+static inline bool cpusets_insane_config(void) { return false; }
+
 static inline int cpuset_init(void) { return 0; }
 static inline void cpuset_init_smp(void) {}
 
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 832aa49d0d8e..fb36a29e3aae 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1220,6 +1220,28 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
 #define for_each_zone_zonelist(zone, z, zlist, highidx) \
 	for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL)
 
+/* Whether the 'nodes' are all movable nodes */
+static inline bool movable_only_nodes(nodemask_t *nodes)
+{
+	struct zonelist *zonelist;
+	struct zoneref *z;
+	int nid;
+
+	if (nodes_empty(*nodes))
+		return false;
+
+	/*
+	 * We can chose arbitrary node from the nodemask to get a
+	 * zonelist as they are interlinked. We just need to find
+	 * at least one zone that can satisfy kernel allocations.
+	 */
+	nid = first_node(*nodes);
+	zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
+	z = first_zones_zonelist(zonelist, ZONE_NORMAL,	nodes);
+	return (!z->zone) ? true : false;
+}
+
+
 #ifdef CONFIG_SPARSEMEM
 #include <asm/sparsemem.h>
 #endif
-- 
cgit v1.2.3


From d2635f2012a44e3d469ab9a4022162dbe0e53f21 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Fri, 5 Nov 2021 13:40:40 -0700
Subject: mm: create a new system state and fix core_kernel_text()

core_kernel_text() considers that until system_state in at least
SYSTEM_RUNNING, init memory is valid.

But init memory is freed a few lines before setting SYSTEM_RUNNING, so
we have a small period of time when core_kernel_text() is wrong.

Create an intermediate system state called SYSTEM_FREEING_INIT that is
set before starting freeing init memory, and use it in
core_kernel_text() to report init memory invalid earlier.

Link: https://lkml.kernel.org/r/9ecfdee7dd4d741d172cb93ff1d87f1c58127c9a.1633001016.git.christophe.leroy@csgroup.eu
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@ozlabs.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kernel.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 2776423a587e..471bc0593679 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -248,6 +248,7 @@ extern bool early_boot_irqs_disabled;
 extern enum system_states {
 	SYSTEM_BOOTING,
 	SYSTEM_SCHEDULING,
+	SYSTEM_FREEING_INITMEM,
 	SYSTEM_RUNNING,
 	SYSTEM_HALT,
 	SYSTEM_POWER_OFF,
-- 
cgit v1.2.3


From 477d01fce8dacb41cae9363136ea56812e8baa59 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Fri, 5 Nov 2021 13:40:58 -0700
Subject: mm: fix data race in PagePoisoned()

PagePoisoned() accesses page->flags which can be updated concurrently:

  | BUG: KCSAN: data-race in next_uptodate_page / unlock_page
  |
  | write (marked) to 0xffffea00050f37c0 of 8 bytes by task 1872 on cpu 1:
  |  instrument_atomic_write           include/linux/instrumented.h:87 [inline]
  |  clear_bit_unlock_is_negative_byte include/asm-generic/bitops/instrumented-lock.h:74 [inline]
  |  unlock_page+0x102/0x1b0           mm/filemap.c:1465
  |  filemap_map_pages+0x6c6/0x890     mm/filemap.c:3057
  |  ...
  | read to 0xffffea00050f37c0 of 8 bytes by task 1873 on cpu 0:
  |  PagePoisoned                   include/linux/page-flags.h:204 [inline]
  |  PageReadahead                  include/linux/page-flags.h:382 [inline]
  |  next_uptodate_page+0x456/0x830 mm/filemap.c:2975
  |  ...
  | CPU: 0 PID: 1873 Comm: systemd-udevd Not tainted 5.11.0-rc4-00001-gf9ce0be71d1f #1

To avoid the compiler tearing or otherwise optimizing the access, use
READ_ONCE() to access flags.

Link: https://lore.kernel.org/all/20210826144157.GA26950@xsang-OptiPlex-9020/
Link: https://lkml.kernel.org/r/20210913113542.2658064-1-elver@google.com
Reported-by: kernel test robot <oliver.sang@intel.com>
Signed-off-by: Marco Elver <elver@google.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Will Deacon <will@kernel.org>
Cc: Marco Elver <elver@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page-flags.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index fbfd3fad48f2..dd80cf0bb579 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -215,7 +215,7 @@ static __always_inline int PageCompound(struct page *page)
 #define	PAGE_POISON_PATTERN	-1l
 static inline int PagePoisoned(const struct page *page)
 {
-	return page->flags == PAGE_POISON_PATTERN;
+	return READ_ONCE(page->flags) == PAGE_POISON_PATTERN;
 }
 
 #ifdef CONFIG_DEBUG_VM
-- 
cgit v1.2.3


From 73c54763482b841d9c14bad87eec98f80f700e0b Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Fri, 5 Nov 2021 13:41:17 -0700
Subject: mm/hugetlb: drop __unmap_hugepage_range definition from hugetlb.h

Remove __unmap_hugepage_range() from the header file, because it is only
used in hugetlb.c.

Link: https://lkml.kernel.org/r/20210917165108.9341-1-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Suggested-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 1faebe1cd0ed..3cbf60464398 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -143,9 +143,6 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
 			  struct vm_area_struct *vma,
 			  unsigned long start, unsigned long end,
 			  struct page *ref_page);
-void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
-				unsigned long start, unsigned long end,
-				struct page *ref_page);
 void hugetlb_report_meminfo(struct seq_file *);
 int hugetlb_report_node_meminfo(char *buf, int len, int nid);
 void hugetlb_show_meminfo(void);
@@ -385,13 +382,6 @@ static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb,
 	BUG();
 }
 
-static inline void __unmap_hugepage_range(struct mmu_gather *tlb,
-			struct vm_area_struct *vma, unsigned long start,
-			unsigned long end, struct page *ref_page)
-{
-	BUG();
-}
-
 static inline vm_fault_t hugetlb_fault(struct mm_struct *mm,
 			struct vm_area_struct *vma, unsigned long address,
 			unsigned int flags)
-- 
cgit v1.2.3


From 79dfc695525f60c8410015362b8aa4eb5c57210c Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Fri, 5 Nov 2021 13:41:20 -0700
Subject: hugetlb: add demote hugetlb page sysfs interfaces

Patch series "hugetlb: add demote/split page functionality", v4.

The concurrent use of multiple hugetlb page sizes on a single system is
becoming more common.  One of the reasons is better TLB support for
gigantic page sizes on x86 hardware.  In addition, hugetlb pages are
being used to back VMs in hosting environments.

When using hugetlb pages to back VMs, it is often desirable to
preallocate hugetlb pools.  This avoids the delay and uncertainty of
allocating hugetlb pages at VM startup.  In addition, preallocating huge
pages minimizes the issue of memory fragmentation that increases the
longer the system is up and running.

In such environments, a combination of larger and smaller hugetlb pages
are preallocated in anticipation of backing VMs of various sizes.  Over
time, the preallocated pool of smaller hugetlb pages may become depleted
while larger hugetlb pages still remain.  In such situations, it is
desirable to convert larger hugetlb pages to smaller hugetlb pages.

Converting larger to smaller hugetlb pages can be accomplished today by
first freeing the larger page to the buddy allocator and then allocating
the smaller pages.  For example, to convert 50 GB pages on x86:

  gb_pages=`cat .../hugepages-1048576kB/nr_hugepages`
  m2_pages=`cat .../hugepages-2048kB/nr_hugepages`
  echo $(($gb_pages - 50)) > .../hugepages-1048576kB/nr_hugepages
  echo $(($m2_pages + 25600)) > .../hugepages-2048kB/nr_hugepages

On an idle system this operation is fairly reliable and results are as
expected.  The number of 2MB pages is increased as expected and the time
of the operation is a second or two.

However, when there is activity on the system the following issues
arise:

1) This process can take quite some time, especially if allocation of
   the smaller pages is not immediate and requires migration/compaction.

2) There is no guarantee that the total size of smaller pages allocated
   will match the size of the larger page which was freed. This is
   because the area freed by the larger page could quickly be
   fragmented.

In a test environment with a load that continually fills the page cache
with clean pages, results such as the following can be observed:

  Unexpected number of 2MB pages allocated: Expected 25600, have 19944
  real    0m42.092s
  user    0m0.008s
  sys     0m41.467s

To address these issues, introduce the concept of hugetlb page demotion.
Demotion provides a means of 'in place' splitting of a hugetlb page to
pages of a smaller size.  This avoids freeing pages to buddy and then
trying to allocate from buddy.

Page demotion is controlled via sysfs files that reside in the per-hugetlb
page size and per node directories.

 - demote_size
        Target page size for demotion, a smaller huge page size. File
        can be written to chose a smaller huge page size if multiple are
        available.

 - demote
        Writable number of hugetlb pages to be demoted

To demote 50 GB huge pages, one would:

  cat .../hugepages-1048576kB/free_hugepages   /* optional, verify free pages */
  cat .../hugepages-1048576kB/demote_size      /* optional, verify target size */
  echo 50 > .../hugepages-1048576kB/demote

Only hugetlb pages which are free at the time of the request can be
demoted.  Demotion does not add to the complexity of surplus pages and
honors reserved huge pages.  Therefore, when a value is written to the
sysfs demote file, that value is only the maximum number of pages which
will be demoted.  It is possible fewer will actually be demoted.  The
recently introduced per-hstate mutex is used to synchronize demote
operations with other operations that modify hugetlb pools.

Real world use cases
--------------------
The above scenario describes a real world use case where hugetlb pages
are used to back VMs on x86.  Both issues of long allocation times and
not necessarily getting the expected number of smaller huge pages after
a free and allocate cycle have been experienced.  The occurrence of
these issues is dependent on other activity within the host and can not
be predicted.

This patch (of 5):

Two new sysfs files are added to demote hugtlb pages.  These files are
both per-hugetlb page size and per node.  Files are:

  demote_size - The size in Kb that pages are demoted to. (read-write)
  demote - The number of huge pages to demote. (write-only)

By default, demote_size is the next smallest huge page size.  Valid huge
page sizes less than huge page size may be written to this file.  When
huge pages are demoted, they are demoted to this size.

Writing a value to demote will result in an attempt to demote that
number of hugetlb pages to an appropriate number of demote_size pages.

NOTE: Demote interfaces are only provided for huge page sizes if there
is a smaller target demote huge page size.  For example, on x86 1GB huge
pages will have demote interfaces.  2MB huge pages will not have demote
interfaces.

This patch does not provide full demote functionality.  It only provides
the sysfs interfaces.

It also provides documentation for the new interfaces.

[mike.kravetz@oracle.com: n_mask initialization does not need to be protected by the mutex]
  Link: https://lkml.kernel.org/r/0530e4ef-2492-5186-f919-5db68edea654@oracle.com

Link: https://lkml.kernel.org/r/20211007181918.136982-2-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: David Hildenbrand <david@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev>
Cc: David Rientjes <rientjes@google.com>
Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.ibm.com>
Cc: Nghia Le <nghialm78@gmail.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 3cbf60464398..2bddd6c38204 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -586,6 +586,7 @@ struct hstate {
 	int next_nid_to_alloc;
 	int next_nid_to_free;
 	unsigned int order;
+	unsigned int demote_order;
 	unsigned long mask;
 	unsigned long max_huge_pages;
 	unsigned long nr_huge_pages;
-- 
cgit v1.2.3


From 9871e2ded6c1ff61a59988d7a0e975f012105d52 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Fri, 5 Nov 2021 13:41:23 -0700
Subject: mm/cma: add cma_pages_valid to determine if pages are in CMA

Add new interface cma_pages_valid() which indicates if the specified
pages are part of a CMA region.  This interface will be used in a
subsequent patch by hugetlb code.

In order to keep the same amount of DEBUG information, a pr_debug() call
was added to cma_pages_valid().  In the case where the page passed to
cma_release is not in cma region, the debug message will be printed from
cma_pages_valid as opposed to cma_release.

Link: https://lkml.kernel.org/r/20211007181918.136982-3-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.ibm.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev>
Cc: Nghia Le <nghialm78@gmail.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cma.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/cma.h b/include/linux/cma.h
index 53fd8c3cdbd0..bd801023504b 100644
--- a/include/linux/cma.h
+++ b/include/linux/cma.h
@@ -46,6 +46,7 @@ extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
 					struct cma **res_cma);
 extern struct page *cma_alloc(struct cma *cma, unsigned long count, unsigned int align,
 			      bool no_warn);
+extern bool cma_pages_valid(struct cma *cma, const struct page *pages, unsigned long count);
 extern bool cma_release(struct cma *cma, const struct page *pages, unsigned long count);
 
 extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data);
-- 
cgit v1.2.3


From bd3400ea173fb611cdf2030d03620185ff6c0b0e Mon Sep 17 00:00:00 2001
From: Liangcai Fan <liangcaifan19@gmail.com>
Date: Fri, 5 Nov 2021 13:41:36 -0700
Subject: mm: khugepaged: recalculate min_free_kbytes after stopping khugepaged

When initializing transparent huge pages, min_free_kbytes would be
calculated according to what khugepaged expected.

So when transparent huge pages get disabled, min_free_kbytes should be
recalculated instead of the higher value set by khugepaged.

Link: https://lkml.kernel.org/r/1633937809-16558-1-git-send-email-liangcaifan19@gmail.com
Signed-off-by: Liangcai Fan <liangcaifan19@gmail.com>
Signed-off-by: Chunyan Zhang <zhang.lyra@gmail.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6e29deb1e0d4..7e37c726b9db 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2453,6 +2453,7 @@ extern void memmap_init_range(unsigned long, int, unsigned long,
 		unsigned long, unsigned long, enum meminit_context,
 		struct vmem_altmap *, int migratetype);
 extern void setup_per_zone_wmarks(void);
+extern void calculate_min_free_kbytes(void);
 extern int __meminit init_per_zone_wmark_min(void);
 extern void mem_init(void);
 extern void __init mmap_init(void);
-- 
cgit v1.2.3


From 550a7d60bd5e35a56942dba6d8a26752beb26c9f Mon Sep 17 00:00:00 2001
From: Mina Almasry <almasrymina@google.com>
Date: Fri, 5 Nov 2021 13:41:40 -0700
Subject: mm, hugepages: add mremap() support for hugepage backed vma

Support mremap() for hugepage backed vma segment by simply repositioning
page table entries.  The page table entries are repositioned to the new
virtual address on mremap().

Hugetlb mremap() support is of course generic; my motivating use case is
a library (hugepage_text), which reloads the ELF text of executables in
hugepages.  This significantly increases the execution performance of
said executables.

Restrict the mremap operation on hugepages to up to the size of the
original mapping as the underlying hugetlb reservation is not yet
capable of handling remapping to a larger size.

During the mremap() operation we detect pmd_share'd mappings and we
unshare those during the mremap().  On access and fault the sharing is
established again.

Link: https://lkml.kernel.org/r/20211013195825.3058275-1-almasrymina@google.com
Signed-off-by: Mina Almasry <almasrymina@google.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Ken Chen <kenchen@google.com>
Cc: Chris Kennelly <ckennelly@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Kirill Shutemov <kirill@shutemov.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 2bddd6c38204..c0a20781b28e 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -124,6 +124,7 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
 void hugepage_put_subpool(struct hugepage_subpool *spool);
 
 void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
+void clear_vma_resv_huge_pages(struct vm_area_struct *vma);
 int hugetlb_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *);
 int hugetlb_overcommit_handler(struct ctl_table *, int, void *, size_t *,
 		loff_t *);
@@ -132,6 +133,10 @@ int hugetlb_treat_movable_handler(struct ctl_table *, int, void *, size_t *,
 int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, void *, size_t *,
 		loff_t *);
 
+int move_hugetlb_page_tables(struct vm_area_struct *vma,
+			     struct vm_area_struct *new_vma,
+			     unsigned long old_addr, unsigned long new_addr,
+			     unsigned long len);
 int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
 long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
 			 struct page **, struct vm_area_struct **,
@@ -215,6 +220,10 @@ static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
 {
 }
 
+static inline void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
+{
+}
+
 static inline unsigned long hugetlb_total_pages(void)
 {
 	return 0;
@@ -262,6 +271,16 @@ static inline int copy_hugetlb_page_range(struct mm_struct *dst,
 	return 0;
 }
 
+static inline int move_hugetlb_page_tables(struct vm_area_struct *vma,
+					   struct vm_area_struct *new_vma,
+					   unsigned long old_addr,
+					   unsigned long new_addr,
+					   unsigned long len)
+{
+	BUG();
+	return 0;
+}
+
 static inline void hugetlb_report_meminfo(struct seq_file *m)
 {
 }
-- 
cgit v1.2.3


From 8cd7c588decf470bf7e14f2be93b709f839a965e Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Fri, 5 Nov 2021 13:42:25 -0700
Subject: mm/vmscan: throttle reclaim until some writeback completes if
 congested

Patch series "Remove dependency on congestion_wait in mm/", v5.

This series that removes all calls to congestion_wait in mm/ and deletes
wait_iff_congested.  It's not a clever implementation but
congestion_wait has been broken for a long time [1].

Even if congestion throttling worked, it was never a great idea.  While
excessive dirty/writeback pages at the tail of the LRU is one
possibility that reclaim may be slow, there is also the problem of too
many pages being isolated and reclaim failing for other reasons
(elevated references, too many pages isolated, excessive LRU contention
etc).

This series replaces the "congestion" throttling with 3 different types.

 - If there are too many dirty/writeback pages, sleep until a timeout or
   enough pages get cleaned

 - If too many pages are isolated, sleep until enough isolated pages are
   either reclaimed or put back on the LRU

 - If no progress is being made, direct reclaim tasks sleep until
   another task makes progress with acceptable efficiency.

This was initially tested with a mix of workloads that used to trigger
corner cases that no longer work.  A new test case was created called
"stutterp" (pagereclaim-stutterp-noreaders in mmtests) using a freshly
created XFS filesystem.  Note that it may be necessary to increase the
timeout of ssh if executing remotely as ssh itself can get throttled and
the connection may timeout.

stutterp varies the number of "worker" processes from 4 up to NR_CPUS*4
to check the impact as the number of direct reclaimers increase.  It has
four types of worker.

 - One "anon latency" worker creates small mappings with mmap() and
   times how long it takes to fault the mapping reading it 4K at a time

 - X file writers which is fio randomly writing X files where the total
   size of the files add up to the allowed dirty_ratio. fio is allowed
   to run for a warmup period to allow some file-backed pages to
   accumulate. The duration of the warmup is based on the best-case
   linear write speed of the storage.

 - Y file readers which is fio randomly reading small files

 - Z anon memory hogs which continually map (100-dirty_ratio)% of memory

 - Total estimated WSS = (100+dirty_ration) percentage of memory

X+Y+Z+1 == NR_WORKERS varying from 4 up to NR_CPUS*4

The intent is to maximise the total WSS with a mix of file and anon
memory where some anonymous memory must be swapped and there is a high
likelihood of dirty/writeback pages reaching the end of the LRU.

The test can be configured to have no background readers to stress
dirty/writeback pages.  The results below are based on having zero
readers.

The short summary of the results is that the series works and stalls
until some event occurs but the timeouts may need adjustment.

The test results are not broken down by patch as the series should be
treated as one block that replaces a broken throttling mechanism with a
working one.

Finally, three machines were tested but I'm reporting the worst set of
results.  The other two machines had much better latencies for example.

First the results of the "anon latency" latency

  stutterp
                                5.15.0-rc1             5.15.0-rc1
                                   vanilla mm-reclaimcongest-v5r4
  Amean     mmap-4      31.4003 (   0.00%)   2661.0198 (-8374.52%)
  Amean     mmap-7      38.1641 (   0.00%)    149.2891 (-291.18%)
  Amean     mmap-12     60.0981 (   0.00%)    187.8105 (-212.51%)
  Amean     mmap-21    161.2699 (   0.00%)    213.9107 ( -32.64%)
  Amean     mmap-30    174.5589 (   0.00%)    377.7548 (-116.41%)
  Amean     mmap-48   8106.8160 (   0.00%)   1070.5616 (  86.79%)
  Stddev    mmap-4      41.3455 (   0.00%)  27573.9676 (-66591.66%)
  Stddev    mmap-7      53.5556 (   0.00%)   4608.5860 (-8505.23%)
  Stddev    mmap-12    171.3897 (   0.00%)   5559.4542 (-3143.75%)
  Stddev    mmap-21   1506.6752 (   0.00%)   5746.2507 (-281.39%)
  Stddev    mmap-30    557.5806 (   0.00%)   7678.1624 (-1277.05%)
  Stddev    mmap-48  61681.5718 (   0.00%)  14507.2830 (  76.48%)
  Max-90    mmap-4      31.4243 (   0.00%)     83.1457 (-164.59%)
  Max-90    mmap-7      41.0410 (   0.00%)     41.0720 (  -0.08%)
  Max-90    mmap-12     66.5255 (   0.00%)     53.9073 (  18.97%)
  Max-90    mmap-21    146.7479 (   0.00%)    105.9540 (  27.80%)
  Max-90    mmap-30    193.9513 (   0.00%)     64.3067 (  66.84%)
  Max-90    mmap-48    277.9137 (   0.00%)    591.0594 (-112.68%)
  Max       mmap-4    1913.8009 (   0.00%) 299623.9695 (-15555.96%)
  Max       mmap-7    2423.9665 (   0.00%) 204453.1708 (-8334.65%)
  Max       mmap-12   6845.6573 (   0.00%) 221090.3366 (-3129.64%)
  Max       mmap-21  56278.6508 (   0.00%) 213877.3496 (-280.03%)
  Max       mmap-30  19716.2990 (   0.00%) 216287.6229 (-997.00%)
  Max       mmap-48 477923.9400 (   0.00%) 245414.8238 (  48.65%)

For most thread counts, the time to mmap() is unfortunately increased.
In earlier versions of the series, this was lower but a large number of
throttling events were reaching their timeout increasing the amount of
inefficient scanning of the LRU.  There is no prioritisation of reclaim
tasks making progress based on each tasks rate of page allocation versus
progress of reclaim.  The variance is also impacted for high worker
counts but in all cases, the differences in latency are not
statistically significant due to very large maximum outliers.  Max-90
shows that 90% of the stalls are comparable but the Max results show the
massive outliers which are increased to to stalling.

It is expected that this will be very machine dependant.  Due to the
test design, reclaim is difficult so allocations stall and there are
variances depending on whether THPs can be allocated or not.  The amount
of memory will affect exactly how bad the corner cases are and how often
they trigger.  The warmup period calculation is not ideal as it's based
on linear writes where as fio is randomly writing multiple files from
multiple tasks so the start state of the test is variable.  For example,
these are the latencies on a single-socket machine that had more memory

  Amean     mmap-4      42.2287 (   0.00%)     49.6838 * -17.65%*
  Amean     mmap-7     216.4326 (   0.00%)     47.4451 *  78.08%*
  Amean     mmap-12   2412.0588 (   0.00%)     51.7497 (  97.85%)
  Amean     mmap-21   5546.2548 (   0.00%)     51.8862 (  99.06%)
  Amean     mmap-30   1085.3121 (   0.00%)     72.1004 (  93.36%)

The overall system CPU usage and elapsed time is as follows

                    5.15.0-rc3  5.15.0-rc3
                       vanilla mm-reclaimcongest-v5r4
  Duration User        6989.03      983.42
  Duration System      7308.12      799.68
  Duration Elapsed     2277.67     2092.98

The patches reduce system CPU usage by 89% as the vanilla kernel is rarely
stalling.

The high-level /proc/vmstats show

                                       5.15.0-rc1     5.15.0-rc1
                                          vanilla mm-reclaimcongest-v5r2
  Ops Direct pages scanned          1056608451.00   503594991.00
  Ops Kswapd pages scanned           109795048.00   147289810.00
  Ops Kswapd pages reclaimed          63269243.00    31036005.00
  Ops Direct pages reclaimed          10803973.00     6328887.00
  Ops Kswapd efficiency %                   57.62          21.07
  Ops Kswapd velocity                    48204.98       57572.86
  Ops Direct efficiency %                    1.02           1.26
  Ops Direct velocity                   463898.83      196845.97

Kswapd scanned less pages but the detailed pattern is different.  The
vanilla kernel scans slowly over time where as the patches exhibits
burst patterns of scan activity.  Direct reclaim scanning is reduced by
52% due to stalling.

The pattern for stealing pages is also slightly different.  Both kernels
exhibit spikes but the vanilla kernel when reclaiming shows pages being
reclaimed over a period of time where as the patches tend to reclaim in
spikes.  The difference is that vanilla is not throttling and instead
scanning constantly finding some pages over time where as the patched
kernel throttles and reclaims in spikes.

  Ops Percentage direct scans               90.59          77.37

For direct reclaim, vanilla scanned 90.59% of pages where as with the
patches, 77.37% were direct reclaim due to throttling

  Ops Page writes by reclaim           2613590.00     1687131.00

Page writes from reclaim context are reduced.

  Ops Page writes anon                 2932752.00     1917048.00

And there is less swapping.

  Ops Page reclaim immediate         996248528.00   107664764.00

The number of pages encountered at the tail of the LRU tagged for
immediate reclaim but still dirty/writeback is reduced by 89%.

  Ops Slabs scanned                     164284.00      153608.00

Slab scan activity is similar.

ftrace was used to gather stall activity

  Vanilla
  -------
      1 writeback_wait_iff_congested: usec_timeout=100000 usec_delayed=16000
      2 writeback_wait_iff_congested: usec_timeout=100000 usec_delayed=12000
      8 writeback_wait_iff_congested: usec_timeout=100000 usec_delayed=8000
     29 writeback_wait_iff_congested: usec_timeout=100000 usec_delayed=4000
  82394 writeback_wait_iff_congested: usec_timeout=100000 usec_delayed=0

The fast majority of wait_iff_congested calls do not stall at all.  What
is likely happening is that cond_resched() reschedules the task for a
short period when the BDI is not registering congestion (which it never
will in this test setup).

      1 writeback_congestion_wait: usec_timeout=100000 usec_delayed=120000
      2 writeback_congestion_wait: usec_timeout=100000 usec_delayed=132000
      4 writeback_congestion_wait: usec_timeout=100000 usec_delayed=112000
    380 writeback_congestion_wait: usec_timeout=100000 usec_delayed=108000
    778 writeback_congestion_wait: usec_timeout=100000 usec_delayed=104000

congestion_wait if called always exceeds the timeout as there is no
trigger to wake it up.

Bottom line: Vanilla will throttle but it's not effective.

Patch series
------------

Kswapd throttle activity was always due to scanning pages tagged for
immediate reclaim at the tail of the LRU

      1 usec_timeout=100000 usect_delayed=72000 reason=VMSCAN_THROTTLE_WRITEBACK
      4 usec_timeout=100000 usect_delayed=20000 reason=VMSCAN_THROTTLE_WRITEBACK
      5 usec_timeout=100000 usect_delayed=12000 reason=VMSCAN_THROTTLE_WRITEBACK
      6 usec_timeout=100000 usect_delayed=16000 reason=VMSCAN_THROTTLE_WRITEBACK
     11 usec_timeout=100000 usect_delayed=100000 reason=VMSCAN_THROTTLE_WRITEBACK
     11 usec_timeout=100000 usect_delayed=8000 reason=VMSCAN_THROTTLE_WRITEBACK
     94 usec_timeout=100000 usect_delayed=0 reason=VMSCAN_THROTTLE_WRITEBACK
    112 usec_timeout=100000 usect_delayed=4000 reason=VMSCAN_THROTTLE_WRITEBACK

The majority of events did not stall or stalled for a short period.
Roughly 16% of stalls reached the timeout before expiry.  For direct
reclaim, the number of times stalled for each reason were

   6624 reason=VMSCAN_THROTTLE_ISOLATED
  93246 reason=VMSCAN_THROTTLE_NOPROGRESS
  96934 reason=VMSCAN_THROTTLE_WRITEBACK

The most common reason to stall was due to excessive pages tagged for
immediate reclaim at the tail of the LRU followed by a failure to make
forward.  A relatively small number were due to too many pages isolated
from the LRU by parallel threads

For VMSCAN_THROTTLE_ISOLATED, the breakdown of delays was

      9 usec_timeout=20000 usect_delayed=4000 reason=VMSCAN_THROTTLE_ISOLATED
     12 usec_timeout=20000 usect_delayed=16000 reason=VMSCAN_THROTTLE_ISOLATED
     83 usec_timeout=20000 usect_delayed=20000 reason=VMSCAN_THROTTLE_ISOLATED
   6520 usec_timeout=20000 usect_delayed=0 reason=VMSCAN_THROTTLE_ISOLATED

Most did not stall at all.  A small number reached the timeout.

For VMSCAN_THROTTLE_NOPROGRESS, the breakdown of stalls were all over
the map

      1 usec_timeout=500000 usect_delayed=324000 reason=VMSCAN_THROTTLE_NOPROGRESS
      1 usec_timeout=500000 usect_delayed=332000 reason=VMSCAN_THROTTLE_NOPROGRESS
      1 usec_timeout=500000 usect_delayed=348000 reason=VMSCAN_THROTTLE_NOPROGRESS
      1 usec_timeout=500000 usect_delayed=360000 reason=VMSCAN_THROTTLE_NOPROGRESS
      2 usec_timeout=500000 usect_delayed=228000 reason=VMSCAN_THROTTLE_NOPROGRESS
      2 usec_timeout=500000 usect_delayed=260000 reason=VMSCAN_THROTTLE_NOPROGRESS
      2 usec_timeout=500000 usect_delayed=340000 reason=VMSCAN_THROTTLE_NOPROGRESS
      2 usec_timeout=500000 usect_delayed=364000 reason=VMSCAN_THROTTLE_NOPROGRESS
      2 usec_timeout=500000 usect_delayed=372000 reason=VMSCAN_THROTTLE_NOPROGRESS
      2 usec_timeout=500000 usect_delayed=428000 reason=VMSCAN_THROTTLE_NOPROGRESS
      2 usec_timeout=500000 usect_delayed=460000 reason=VMSCAN_THROTTLE_NOPROGRESS
      2 usec_timeout=500000 usect_delayed=464000 reason=VMSCAN_THROTTLE_NOPROGRESS
      3 usec_timeout=500000 usect_delayed=244000 reason=VMSCAN_THROTTLE_NOPROGRESS
      3 usec_timeout=500000 usect_delayed=252000 reason=VMSCAN_THROTTLE_NOPROGRESS
      3 usec_timeout=500000 usect_delayed=272000 reason=VMSCAN_THROTTLE_NOPROGRESS
      4 usec_timeout=500000 usect_delayed=188000 reason=VMSCAN_THROTTLE_NOPROGRESS
      4 usec_timeout=500000 usect_delayed=268000 reason=VMSCAN_THROTTLE_NOPROGRESS
      4 usec_timeout=500000 usect_delayed=328000 reason=VMSCAN_THROTTLE_NOPROGRESS
      4 usec_timeout=500000 usect_delayed=380000 reason=VMSCAN_THROTTLE_NOPROGRESS
      4 usec_timeout=500000 usect_delayed=392000 reason=VMSCAN_THROTTLE_NOPROGRESS
      4 usec_timeout=500000 usect_delayed=432000 reason=VMSCAN_THROTTLE_NOPROGRESS
      5 usec_timeout=500000 usect_delayed=204000 reason=VMSCAN_THROTTLE_NOPROGRESS
      5 usec_timeout=500000 usect_delayed=220000 reason=VMSCAN_THROTTLE_NOPROGRESS
      5 usec_timeout=500000 usect_delayed=412000 reason=VMSCAN_THROTTLE_NOPROGRESS
      5 usec_timeout=500000 usect_delayed=436000 reason=VMSCAN_THROTTLE_NOPROGRESS
      6 usec_timeout=500000 usect_delayed=488000 reason=VMSCAN_THROTTLE_NOPROGRESS
      7 usec_timeout=500000 usect_delayed=212000 reason=VMSCAN_THROTTLE_NOPROGRESS
      7 usec_timeout=500000 usect_delayed=300000 reason=VMSCAN_THROTTLE_NOPROGRESS
      7 usec_timeout=500000 usect_delayed=316000 reason=VMSCAN_THROTTLE_NOPROGRESS
      7 usec_timeout=500000 usect_delayed=472000 reason=VMSCAN_THROTTLE_NOPROGRESS
      8 usec_timeout=500000 usect_delayed=248000 reason=VMSCAN_THROTTLE_NOPROGRESS
      8 usec_timeout=500000 usect_delayed=356000 reason=VMSCAN_THROTTLE_NOPROGRESS
      8 usec_timeout=500000 usect_delayed=456000 reason=VMSCAN_THROTTLE_NOPROGRESS
      9 usec_timeout=500000 usect_delayed=124000 reason=VMSCAN_THROTTLE_NOPROGRESS
      9 usec_timeout=500000 usect_delayed=376000 reason=VMSCAN_THROTTLE_NOPROGRESS
      9 usec_timeout=500000 usect_delayed=484000 reason=VMSCAN_THROTTLE_NOPROGRESS
     10 usec_timeout=500000 usect_delayed=172000 reason=VMSCAN_THROTTLE_NOPROGRESS
     10 usec_timeout=500000 usect_delayed=420000 reason=VMSCAN_THROTTLE_NOPROGRESS
     10 usec_timeout=500000 usect_delayed=452000 reason=VMSCAN_THROTTLE_NOPROGRESS
     11 usec_timeout=500000 usect_delayed=256000 reason=VMSCAN_THROTTLE_NOPROGRESS
     12 usec_timeout=500000 usect_delayed=112000 reason=VMSCAN_THROTTLE_NOPROGRESS
     12 usec_timeout=500000 usect_delayed=116000 reason=VMSCAN_THROTTLE_NOPROGRESS
     12 usec_timeout=500000 usect_delayed=144000 reason=VMSCAN_THROTTLE_NOPROGRESS
     12 usec_timeout=500000 usect_delayed=152000 reason=VMSCAN_THROTTLE_NOPROGRESS
     12 usec_timeout=500000 usect_delayed=264000 reason=VMSCAN_THROTTLE_NOPROGRESS
     12 usec_timeout=500000 usect_delayed=384000 reason=VMSCAN_THROTTLE_NOPROGRESS
     12 usec_timeout=500000 usect_delayed=424000 reason=VMSCAN_THROTTLE_NOPROGRESS
     12 usec_timeout=500000 usect_delayed=492000 reason=VMSCAN_THROTTLE_NOPROGRESS
     13 usec_timeout=500000 usect_delayed=184000 reason=VMSCAN_THROTTLE_NOPROGRESS
     13 usec_timeout=500000 usect_delayed=444000 reason=VMSCAN_THROTTLE_NOPROGRESS
     14 usec_timeout=500000 usect_delayed=308000 reason=VMSCAN_THROTTLE_NOPROGRESS
     14 usec_timeout=500000 usect_delayed=440000 reason=VMSCAN_THROTTLE_NOPROGRESS
     14 usec_timeout=500000 usect_delayed=476000 reason=VMSCAN_THROTTLE_NOPROGRESS
     16 usec_timeout=500000 usect_delayed=140000 reason=VMSCAN_THROTTLE_NOPROGRESS
     17 usec_timeout=500000 usect_delayed=232000 reason=VMSCAN_THROTTLE_NOPROGRESS
     17 usec_timeout=500000 usect_delayed=240000 reason=VMSCAN_THROTTLE_NOPROGRESS
     17 usec_timeout=500000 usect_delayed=280000 reason=VMSCAN_THROTTLE_NOPROGRESS
     18 usec_timeout=500000 usect_delayed=404000 reason=VMSCAN_THROTTLE_NOPROGRESS
     20 usec_timeout=500000 usect_delayed=148000 reason=VMSCAN_THROTTLE_NOPROGRESS
     20 usec_timeout=500000 usect_delayed=216000 reason=VMSCAN_THROTTLE_NOPROGRESS
     20 usec_timeout=500000 usect_delayed=468000 reason=VMSCAN_THROTTLE_NOPROGRESS
     21 usec_timeout=500000 usect_delayed=448000 reason=VMSCAN_THROTTLE_NOPROGRESS
     23 usec_timeout=500000 usect_delayed=168000 reason=VMSCAN_THROTTLE_NOPROGRESS
     23 usec_timeout=500000 usect_delayed=296000 reason=VMSCAN_THROTTLE_NOPROGRESS
     25 usec_timeout=500000 usect_delayed=132000 reason=VMSCAN_THROTTLE_NOPROGRESS
     25 usec_timeout=500000 usect_delayed=352000 reason=VMSCAN_THROTTLE_NOPROGRESS
     26 usec_timeout=500000 usect_delayed=180000 reason=VMSCAN_THROTTLE_NOPROGRESS
     27 usec_timeout=500000 usect_delayed=284000 reason=VMSCAN_THROTTLE_NOPROGRESS
     28 usec_timeout=500000 usect_delayed=164000 reason=VMSCAN_THROTTLE_NOPROGRESS
     29 usec_timeout=500000 usect_delayed=136000 reason=VMSCAN_THROTTLE_NOPROGRESS
     30 usec_timeout=500000 usect_delayed=200000 reason=VMSCAN_THROTTLE_NOPROGRESS
     30 usec_timeout=500000 usect_delayed=400000 reason=VMSCAN_THROTTLE_NOPROGRESS
     31 usec_timeout=500000 usect_delayed=196000 reason=VMSCAN_THROTTLE_NOPROGRESS
     32 usec_timeout=500000 usect_delayed=156000 reason=VMSCAN_THROTTLE_NOPROGRESS
     33 usec_timeout=500000 usect_delayed=224000 reason=VMSCAN_THROTTLE_NOPROGRESS
     35 usec_timeout=500000 usect_delayed=128000 reason=VMSCAN_THROTTLE_NOPROGRESS
     35 usec_timeout=500000 usect_delayed=176000 reason=VMSCAN_THROTTLE_NOPROGRESS
     36 usec_timeout=500000 usect_delayed=368000 reason=VMSCAN_THROTTLE_NOPROGRESS
     36 usec_timeout=500000 usect_delayed=496000 reason=VMSCAN_THROTTLE_NOPROGRESS
     37 usec_timeout=500000 usect_delayed=312000 reason=VMSCAN_THROTTLE_NOPROGRESS
     38 usec_timeout=500000 usect_delayed=304000 reason=VMSCAN_THROTTLE_NOPROGRESS
     40 usec_timeout=500000 usect_delayed=288000 reason=VMSCAN_THROTTLE_NOPROGRESS
     43 usec_timeout=500000 usect_delayed=408000 reason=VMSCAN_THROTTLE_NOPROGRESS
     55 usec_timeout=500000 usect_delayed=416000 reason=VMSCAN_THROTTLE_NOPROGRESS
     56 usec_timeout=500000 usect_delayed=76000 reason=VMSCAN_THROTTLE_NOPROGRESS
     58 usec_timeout=500000 usect_delayed=120000 reason=VMSCAN_THROTTLE_NOPROGRESS
     59 usec_timeout=500000 usect_delayed=208000 reason=VMSCAN_THROTTLE_NOPROGRESS
     61 usec_timeout=500000 usect_delayed=68000 reason=VMSCAN_THROTTLE_NOPROGRESS
     71 usec_timeout=500000 usect_delayed=192000 reason=VMSCAN_THROTTLE_NOPROGRESS
     71 usec_timeout=500000 usect_delayed=480000 reason=VMSCAN_THROTTLE_NOPROGRESS
     79 usec_timeout=500000 usect_delayed=60000 reason=VMSCAN_THROTTLE_NOPROGRESS
     82 usec_timeout=500000 usect_delayed=320000 reason=VMSCAN_THROTTLE_NOPROGRESS
     82 usec_timeout=500000 usect_delayed=92000 reason=VMSCAN_THROTTLE_NOPROGRESS
     85 usec_timeout=500000 usect_delayed=64000 reason=VMSCAN_THROTTLE_NOPROGRESS
     85 usec_timeout=500000 usect_delayed=80000 reason=VMSCAN_THROTTLE_NOPROGRESS
     88 usec_timeout=500000 usect_delayed=84000 reason=VMSCAN_THROTTLE_NOPROGRESS
     90 usec_timeout=500000 usect_delayed=160000 reason=VMSCAN_THROTTLE_NOPROGRESS
     90 usec_timeout=500000 usect_delayed=292000 reason=VMSCAN_THROTTLE_NOPROGRESS
     94 usec_timeout=500000 usect_delayed=56000 reason=VMSCAN_THROTTLE_NOPROGRESS
    118 usec_timeout=500000 usect_delayed=88000 reason=VMSCAN_THROTTLE_NOPROGRESS
    119 usec_timeout=500000 usect_delayed=72000 reason=VMSCAN_THROTTLE_NOPROGRESS
    126 usec_timeout=500000 usect_delayed=108000 reason=VMSCAN_THROTTLE_NOPROGRESS
    146 usec_timeout=500000 usect_delayed=52000 reason=VMSCAN_THROTTLE_NOPROGRESS
    148 usec_timeout=500000 usect_delayed=36000 reason=VMSCAN_THROTTLE_NOPROGRESS
    148 usec_timeout=500000 usect_delayed=48000 reason=VMSCAN_THROTTLE_NOPROGRESS
    159 usec_timeout=500000 usect_delayed=28000 reason=VMSCAN_THROTTLE_NOPROGRESS
    178 usec_timeout=500000 usect_delayed=44000 reason=VMSCAN_THROTTLE_NOPROGRESS
    183 usec_timeout=500000 usect_delayed=40000 reason=VMSCAN_THROTTLE_NOPROGRESS
    237 usec_timeout=500000 usect_delayed=100000 reason=VMSCAN_THROTTLE_NOPROGRESS
    266 usec_timeout=500000 usect_delayed=32000 reason=VMSCAN_THROTTLE_NOPROGRESS
    313 usec_timeout=500000 usect_delayed=24000 reason=VMSCAN_THROTTLE_NOPROGRESS
    347 usec_timeout=500000 usect_delayed=96000 reason=VMSCAN_THROTTLE_NOPROGRESS
    470 usec_timeout=500000 usect_delayed=20000 reason=VMSCAN_THROTTLE_NOPROGRESS
    559 usec_timeout=500000 usect_delayed=16000 reason=VMSCAN_THROTTLE_NOPROGRESS
    964 usec_timeout=500000 usect_delayed=12000 reason=VMSCAN_THROTTLE_NOPROGRESS
   2001 usec_timeout=500000 usect_delayed=104000 reason=VMSCAN_THROTTLE_NOPROGRESS
   2447 usec_timeout=500000 usect_delayed=8000 reason=VMSCAN_THROTTLE_NOPROGRESS
   7888 usec_timeout=500000 usect_delayed=4000 reason=VMSCAN_THROTTLE_NOPROGRESS
  22727 usec_timeout=500000 usect_delayed=0 reason=VMSCAN_THROTTLE_NOPROGRESS
  51305 usec_timeout=500000 usect_delayed=500000 reason=VMSCAN_THROTTLE_NOPROGRESS

The full timeout is often hit but a large number also do not stall at
all.  The remainder slept a little allowing other reclaim tasks to make
progress.

While this timeout could be further increased, it could also negatively
impact worst-case behaviour when there is no prioritisation of what task
should make progress.

For VMSCAN_THROTTLE_WRITEBACK, the breakdown was

      1 usec_timeout=100000 usect_delayed=44000 reason=VMSCAN_THROTTLE_WRITEBACK
      2 usec_timeout=100000 usect_delayed=76000 reason=VMSCAN_THROTTLE_WRITEBACK
      3 usec_timeout=100000 usect_delayed=80000 reason=VMSCAN_THROTTLE_WRITEBACK
      5 usec_timeout=100000 usect_delayed=48000 reason=VMSCAN_THROTTLE_WRITEBACK
      5 usec_timeout=100000 usect_delayed=84000 reason=VMSCAN_THROTTLE_WRITEBACK
      6 usec_timeout=100000 usect_delayed=72000 reason=VMSCAN_THROTTLE_WRITEBACK
      7 usec_timeout=100000 usect_delayed=88000 reason=VMSCAN_THROTTLE_WRITEBACK
     11 usec_timeout=100000 usect_delayed=56000 reason=VMSCAN_THROTTLE_WRITEBACK
     12 usec_timeout=100000 usect_delayed=64000 reason=VMSCAN_THROTTLE_WRITEBACK
     16 usec_timeout=100000 usect_delayed=92000 reason=VMSCAN_THROTTLE_WRITEBACK
     24 usec_timeout=100000 usect_delayed=68000 reason=VMSCAN_THROTTLE_WRITEBACK
     28 usec_timeout=100000 usect_delayed=32000 reason=VMSCAN_THROTTLE_WRITEBACK
     30 usec_timeout=100000 usect_delayed=60000 reason=VMSCAN_THROTTLE_WRITEBACK
     30 usec_timeout=100000 usect_delayed=96000 reason=VMSCAN_THROTTLE_WRITEBACK
     32 usec_timeout=100000 usect_delayed=52000 reason=VMSCAN_THROTTLE_WRITEBACK
     42 usec_timeout=100000 usect_delayed=40000 reason=VMSCAN_THROTTLE_WRITEBACK
     77 usec_timeout=100000 usect_delayed=28000 reason=VMSCAN_THROTTLE_WRITEBACK
     99 usec_timeout=100000 usect_delayed=36000 reason=VMSCAN_THROTTLE_WRITEBACK
    137 usec_timeout=100000 usect_delayed=24000 reason=VMSCAN_THROTTLE_WRITEBACK
    190 usec_timeout=100000 usect_delayed=20000 reason=VMSCAN_THROTTLE_WRITEBACK
    339 usec_timeout=100000 usect_delayed=16000 reason=VMSCAN_THROTTLE_WRITEBACK
    518 usec_timeout=100000 usect_delayed=12000 reason=VMSCAN_THROTTLE_WRITEBACK
    852 usec_timeout=100000 usect_delayed=8000 reason=VMSCAN_THROTTLE_WRITEBACK
   3359 usec_timeout=100000 usect_delayed=4000 reason=VMSCAN_THROTTLE_WRITEBACK
   7147 usec_timeout=100000 usect_delayed=0 reason=VMSCAN_THROTTLE_WRITEBACK
  83962 usec_timeout=100000 usect_delayed=100000 reason=VMSCAN_THROTTLE_WRITEBACK

The majority hit the timeout in direct reclaim context although a
sizable number did not stall at all.  This is very different to kswapd
where only a tiny percentage of stalls due to writeback reached the
timeout.

Bottom line, the throttling appears to work and the wakeup events may
limit worst case stalls.  There might be some grounds for adjusting
timeouts but it's likely futile as the worst-case scenarios depend on
the workload, memory size and the speed of the storage.  A better
approach to improve the series further would be to prioritise tasks
based on their rate of allocation with the caveat that it may be very
expensive to track.

This patch (of 5):

Page reclaim throttles on wait_iff_congested under the following
conditions:

 - kswapd is encountering pages under writeback and marked for immediate
   reclaim implying that pages are cycling through the LRU faster than
   pages can be cleaned.

 - Direct reclaim will stall if all dirty pages are backed by congested
   inodes.

wait_iff_congested is almost completely broken with few exceptions.
This patch adds a new node-based workqueue and tracks the number of
throttled tasks and pages written back since throttling started.  If
enough pages belonging to the node are written back then the throttled
tasks will wake early.  If not, the throttled tasks sleeps until the
timeout expires.

[neilb@suse.de: Uninterruptible sleep and simpler wakeups]
[hdanton@sina.com: Avoid race when reclaim starts]
[vbabka@suse.cz: vmstat irq-safe api, clarifications]

Link: https://lore.kernel.org/linux-mm/45d8b7a6-8548-65f5-cccf-9f451d4ae3d4@kernel.dk/ [1]
Link: https://lkml.kernel.org/r/20211022144651.19914-1-mgorman@techsingularity.net
Link: https://lkml.kernel.org/r/20211022144651.19914-2-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: NeilBrown <neilb@suse.de>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: "Darrick J . Wong" <djwong@kernel.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/backing-dev.h |  1 -
 include/linux/mmzone.h      | 13 +++++++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index ac7f231b8825..9fb1f0ae273c 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -154,7 +154,6 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits)
 }
 
 long congestion_wait(int sync, long timeout);
-long wait_iff_congested(int sync, long timeout);
 
 static inline bool mapping_can_writeback(struct address_space *mapping)
 {
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index fb36a29e3aae..5a5e19b90dab 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -199,6 +199,7 @@ enum node_stat_item {
 	NR_VMSCAN_IMMEDIATE,	/* Prioritise for reclaim when writeback ends */
 	NR_DIRTIED,		/* page dirtyings since bootup */
 	NR_WRITTEN,		/* page writings since bootup */
+	NR_THROTTLED_WRITTEN,	/* NR_WRITTEN while reclaim throttled */
 	NR_KERNEL_MISC_RECLAIMABLE,	/* reclaimable non-slab kernel pages */
 	NR_FOLL_PIN_ACQUIRED,	/* via: pin_user_page(), gup flag: FOLL_PIN */
 	NR_FOLL_PIN_RELEASED,	/* pages returned via unpin_user_page() */
@@ -272,6 +273,11 @@ enum lru_list {
 	NR_LRU_LISTS
 };
 
+enum vmscan_throttle_state {
+	VMSCAN_THROTTLE_WRITEBACK,
+	NR_VMSCAN_THROTTLE,
+};
+
 #define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++)
 
 #define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++)
@@ -841,6 +847,13 @@ typedef struct pglist_data {
 	int node_id;
 	wait_queue_head_t kswapd_wait;
 	wait_queue_head_t pfmemalloc_wait;
+
+	/* workqueues for throttling reclaim for different reasons. */
+	wait_queue_head_t reclaim_wait[NR_VMSCAN_THROTTLE];
+
+	atomic_t nr_writeback_throttled;/* nr of writeback-throttled tasks */
+	unsigned long nr_reclaim_start;	/* nr pages written while throttled
+					 * when throttling started. */
 	struct task_struct *kswapd;	/* Protected by
 					   mem_hotplug_begin/end() */
 	int kswapd_order;
-- 
cgit v1.2.3


From d818fca1cac31b1fc9301bda83e195a46fb4ebaa Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Fri, 5 Nov 2021 13:42:29 -0700
Subject: mm/vmscan: throttle reclaim and compaction when too may pages are
 isolated

Page reclaim throttles on congestion if too many parallel reclaim
instances have isolated too many pages.  This makes no sense, excessive
parallelisation has nothing to do with writeback or congestion.

This patch creates an additional workqueue to sleep on when too many
pages are isolated.  The throttled tasks are woken when the number of
isolated pages is reduced or a timeout occurs.  There may be some false
positive wakeups for GFP_NOIO/GFP_NOFS callers but the tasks will
throttle again if necessary.

[shy828301@gmail.com: Wake up from compaction context]
[vbabka@suse.cz: Account number of throttled tasks only for writeback]

Link: https://lkml.kernel.org/r/20211022144651.19914-3-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: "Darrick J . Wong" <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: NeilBrown <neilb@suse.de>
Cc: Rik van Riel <riel@surriel.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 5a5e19b90dab..312c1ea9aafa 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -275,6 +275,7 @@ enum lru_list {
 
 enum vmscan_throttle_state {
 	VMSCAN_THROTTLE_WRITEBACK,
+	VMSCAN_THROTTLE_ISOLATED,
 	NR_VMSCAN_THROTTLE,
 };
 
-- 
cgit v1.2.3


From 69392a403f49e6e33f9dfb1d6edb87c8006f83c2 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Fri, 5 Nov 2021 13:42:32 -0700
Subject: mm/vmscan: throttle reclaim when no progress is being made

Memcg reclaim throttles on congestion if no reclaim progress is made.
This makes little sense, it might be due to writeback or a host of other
factors.

For !memcg reclaim, it's messy.  Direct reclaim primarily is throttled
in the page allocator if it is failing to make progress.  Kswapd
throttles if too many pages are under writeback and marked for immediate
reclaim.

This patch explicitly throttles if reclaim is failing to make progress.

[vbabka@suse.cz: Remove redundant code]

Link: https://lkml.kernel.org/r/20211022144651.19914-4-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: "Darrick J . Wong" <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: NeilBrown <neilb@suse.de>
Cc: Rik van Riel <riel@surriel.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 312c1ea9aafa..58e744b78c2c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -276,6 +276,7 @@ enum lru_list {
 enum vmscan_throttle_state {
 	VMSCAN_THROTTLE_WRITEBACK,
 	VMSCAN_THROTTLE_ISOLATED,
+	VMSCAN_THROTTLE_NOPROGRESS,
 	NR_VMSCAN_THROTTLE,
 };
 
-- 
cgit v1.2.3


From 7e6ec49c18988f1b8dab0677271dafde5f8d9a43 Mon Sep 17 00:00:00 2001
From: Yuanzheng Song <songyuanzheng@huawei.com>
Date: Fri, 5 Nov 2021 13:42:52 -0700
Subject: mm/vmpressure: fix data-race with memcg->socket_pressure

When reading memcg->socket_pressure in mem_cgroup_under_socket_pressure()
and writing memcg->socket_pressure in vmpressure() at the same time, the
following data-race occurs:

  BUG: KCSAN: data-race in __sk_mem_reduce_allocated / vmpressure

  write to 0xffff8881286f4938 of 8 bytes by task 24550 on cpu 3:
   vmpressure+0x218/0x230 mm/vmpressure.c:307
   shrink_node_memcgs+0x2b9/0x410 mm/vmscan.c:2658
   shrink_node+0x9d2/0x11d0 mm/vmscan.c:2769
   shrink_zones+0x29f/0x470 mm/vmscan.c:2972
   do_try_to_free_pages+0x193/0x6e0 mm/vmscan.c:3027
   try_to_free_mem_cgroup_pages+0x1c0/0x3f0 mm/vmscan.c:3345
   reclaim_high mm/memcontrol.c:2440 [inline]
   mem_cgroup_handle_over_high+0x18b/0x4d0 mm/memcontrol.c:2624
   tracehook_notify_resume include/linux/tracehook.h:197 [inline]
   exit_to_user_mode_loop kernel/entry/common.c:164 [inline]
   exit_to_user_mode_prepare+0x110/0x170 kernel/entry/common.c:191
   syscall_exit_to_user_mode+0x16/0x30 kernel/entry/common.c:266
   ret_from_fork+0x15/0x30 arch/x86/entry/entry_64.S:289

  read to 0xffff8881286f4938 of 8 bytes by interrupt on cpu 1:
   mem_cgroup_under_socket_pressure include/linux/memcontrol.h:1483 [inline]
   sk_under_memory_pressure include/net/sock.h:1314 [inline]
   __sk_mem_reduce_allocated+0x1d2/0x270 net/core/sock.c:2696
   __sk_mem_reclaim+0x44/0x50 net/core/sock.c:2711
   sk_mem_reclaim include/net/sock.h:1490 [inline]
   ......
   net_rx_action+0x17a/0x480 net/core/dev.c:6864
   __do_softirq+0x12c/0x2af kernel/softirq.c:298
   run_ksoftirqd+0x13/0x20 kernel/softirq.c:653
   smpboot_thread_fn+0x33f/0x510 kernel/smpboot.c:165
   kthread+0x1fc/0x220 kernel/kthread.c:292
   ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:296

Fix it by using READ_ONCE() and WRITE_ONCE() to read and write
memcg->socket_pressure.

Link: https://lkml.kernel.org/r/20211025082843.671690-1-songyuanzheng@huawei.com
Signed-off-by: Yuanzheng Song <songyuanzheng@huawei.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Alex Shi <alexs@kernel.org>
Cc: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index f207f98bdb76..9d96238d9c21 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1606,7 +1606,7 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
 	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_pressure)
 		return true;
 	do {
-		if (time_before(jiffies, memcg->socket_pressure))
+		if (time_before(jiffies, READ_ONCE(memcg->socket_pressure)))
 			return true;
 	} while ((memcg = parent_mem_cgroup(memcg)));
 	return false;
-- 
cgit v1.2.3


From fa27717110ae51b9b9013ced0b5143888257bb79 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Fri, 5 Nov 2021 13:43:13 -0700
Subject: memblock: drop memblock_free_early_nid() and memblock_free_early()

memblock_free_early_nid() is unused and memblock_free_early() is an
alias for memblock_free().

Replace calls to memblock_free_early() with calls to memblock_free() and
remove memblock_free_early() and memblock_free_early_nid().

Link: https://lkml.kernel.org/r/20210930185031.18648-4-rppt@kernel.org
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Juergen Gross <jgross@suse.com>
Cc: Shahab Vahedi <Shahab.Vahedi@synopsys.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memblock.h | 12 ------------
 1 file changed, 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 34de69b3b8ba..fc8183be340c 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -441,18 +441,6 @@ static inline void *memblock_alloc_node(phys_addr_t size,
 				      MEMBLOCK_ALLOC_ACCESSIBLE, nid);
 }
 
-static inline void memblock_free_early(phys_addr_t base,
-					      phys_addr_t size)
-{
-	memblock_free(base, size);
-}
-
-static inline void memblock_free_early_nid(phys_addr_t base,
-						  phys_addr_t size, int nid)
-{
-	memblock_free(base, size);
-}
-
 static inline void memblock_free_late(phys_addr_t base, phys_addr_t size)
 {
 	__memblock_free_late(base, size);
-- 
cgit v1.2.3


From 621d973901cf9fa6c6e31b31bdd36c5c5f3c9c9e Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Fri, 5 Nov 2021 13:43:16 -0700
Subject: memblock: stop aliasing __memblock_free_late with memblock_free_late

memblock_free_late() is a NOP wrapper for __memblock_free_late(), there
is no point to keep this indirection.

Drop the wrapper and rename __memblock_free_late() to
memblock_free_late().

Link: https://lkml.kernel.org/r/20210930185031.18648-5-rppt@kernel.org
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Juergen Gross <jgross@suse.com>
Cc: Shahab Vahedi <Shahab.Vahedi@synopsys.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memblock.h | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index fc8183be340c..e25f964fdd60 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -133,7 +133,7 @@ void __next_mem_range_rev(u64 *idx, int nid, enum memblock_flags flags,
 			  struct memblock_type *type_b, phys_addr_t *out_start,
 			  phys_addr_t *out_end, int *out_nid);
 
-void __memblock_free_late(phys_addr_t base, phys_addr_t size);
+void memblock_free_late(phys_addr_t base, phys_addr_t size);
 
 #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
 static inline void __next_physmem_range(u64 *idx, struct memblock_type *type,
@@ -441,11 +441,6 @@ static inline void *memblock_alloc_node(phys_addr_t size,
 				      MEMBLOCK_ALLOC_ACCESSIBLE, nid);
 }
 
-static inline void memblock_free_late(phys_addr_t base, phys_addr_t size)
-{
-	__memblock_free_late(base, size);
-}
-
 /*
  * Set the allocation direction to bottom-up or top-down.
  */
-- 
cgit v1.2.3


From 3ecc68349bbab6bff1d12cbc7951ca6019b2faf6 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Fri, 5 Nov 2021 13:43:19 -0700
Subject: memblock: rename memblock_free to memblock_phys_free

Since memblock_free() operates on a physical range, make its name
reflect it and rename it to memblock_phys_free(), so it will be a
logical counterpart to memblock_phys_alloc().

The callers are updated with the below semantic patch:

    @@
    expression addr;
    expression size;
    @@
    - memblock_free(addr, size);
    + memblock_phys_free(addr, size);

Link: https://lkml.kernel.org/r/20210930185031.18648-6-rppt@kernel.org
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Juergen Gross <jgross@suse.com>
Cc: Shahab Vahedi <Shahab.Vahedi@synopsys.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memblock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index e25f964fdd60..d32d41709513 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -103,7 +103,7 @@ void memblock_allow_resize(void);
 int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid);
 int memblock_add(phys_addr_t base, phys_addr_t size);
 int memblock_remove(phys_addr_t base, phys_addr_t size);
-int memblock_free(phys_addr_t base, phys_addr_t size);
+int memblock_phys_free(phys_addr_t base, phys_addr_t size);
 int memblock_reserve(phys_addr_t base, phys_addr_t size);
 #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
 int memblock_physmem_add(phys_addr_t base, phys_addr_t size);
-- 
cgit v1.2.3


From 4421cca0a3e4833b3bf0f20de98eb580ab8c7290 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Fri, 5 Nov 2021 13:43:22 -0700
Subject: memblock: use memblock_free for freeing virtual pointers

Rename memblock_free_ptr() to memblock_free() and use memblock_free()
when freeing a virtual pointer so that memblock_free() will be a
counterpart of memblock_alloc()

The callers are updated with the below semantic patch and manual
addition of (void *) casting to pointers that are represented by
unsigned long variables.

    @@
    identifier vaddr;
    expression size;
    @@
    (
    - memblock_phys_free(__pa(vaddr), size);
    + memblock_free(vaddr, size);
    |
    - memblock_free_ptr(vaddr, size);
    + memblock_free(vaddr, size);
    )

[sfr@canb.auug.org.au: fixup]
  Link: https://lkml.kernel.org/r/20211018192940.3d1d532f@canb.auug.org.au

Link: https://lkml.kernel.org/r/20210930185031.18648-7-rppt@kernel.org
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Juergen Gross <jgross@suse.com>
Cc: Shahab Vahedi <Shahab.Vahedi@synopsys.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memblock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index d32d41709513..484650681bee 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -118,7 +118,7 @@ int memblock_mark_nomap(phys_addr_t base, phys_addr_t size);
 int memblock_clear_nomap(phys_addr_t base, phys_addr_t size);
 
 void memblock_free_all(void);
-void memblock_free_ptr(void *ptr, size_t size);
+void memblock_free(void *ptr, size_t size);
 void reset_node_managed_pages(pg_data_t *pgdat);
 void reset_all_zones_managed_pages(void);
 
-- 
cgit v1.2.3


From b5389086ad7be0453c55e0069a89856d1fbdf605 Mon Sep 17 00:00:00 2001
From: Zhenguo Yao <yaozhenguo1@gmail.com>
Date: Fri, 5 Nov 2021 13:43:28 -0700
Subject: hugetlbfs: extend the definition of hugepages parameter to support
 node allocation

We can specify the number of hugepages to allocate at boot.  But the
hugepages is balanced in all nodes at present.  In some scenarios, we
only need hugepages in one node.  For example: DPDK needs hugepages
which are in the same node as NIC.

If DPDK needs four hugepages of 1G size in node1 and system has 16 numa
nodes we must reserve 64 hugepages on the kernel cmdline.  But only four
hugepages are used.  The others should be free after boot.  If the
system memory is low(for example: 64G), it will be an impossible task.

So extend the hugepages parameter to support specifying hugepages on a
specific node.  For example add following parameter:

  hugepagesz=1G hugepages=0:1,1:3

It will allocate 1 hugepage in node0 and 3 hugepages in node1.

Link: https://lkml.kernel.org/r/20211005054729.86457-1-yaozhenguo1@gmail.com
Signed-off-by: Zhenguo Yao <yaozhenguo1@gmail.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Zhenguo Yao <yaozhenguo1@gmail.com>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index c0a20781b28e..44c2ab0dfa59 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -615,6 +615,7 @@ struct hstate {
 	unsigned long nr_overcommit_huge_pages;
 	struct list_head hugepage_activelist;
 	struct list_head hugepage_freelists[MAX_NUMNODES];
+	unsigned int max_huge_pages_node[MAX_NUMNODES];
 	unsigned int nr_huge_pages_node[MAX_NUMNODES];
 	unsigned int free_huge_pages_node[MAX_NUMNODES];
 	unsigned int surplus_huge_pages_node[MAX_NUMNODES];
@@ -647,8 +648,9 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
 				unsigned long address, struct page *page);
 
 /* arch callback */
-int __init __alloc_bootmem_huge_page(struct hstate *h);
-int __init alloc_bootmem_huge_page(struct hstate *h);
+int __init __alloc_bootmem_huge_page(struct hstate *h, int nid);
+int __init alloc_bootmem_huge_page(struct hstate *h, int nid);
+bool __init hugetlb_node_alloc_supported(void);
 
 void __init hugetlb_add_hstate(unsigned order);
 bool __init arch_hugetlb_valid_size(unsigned long size);
-- 
cgit v1.2.3


From 8eb42beac8d369f5443addd469879d152422a28d Mon Sep 17 00:00:00 2001
From: John Hubbard <jhubbard@nvidia.com>
Date: Fri, 5 Nov 2021 13:43:32 -0700
Subject: mm/migrate: de-duplicate migrate_reason strings

In order to remove the need to manually keep three different files in
synch, provide a common definition of the mapping between enum
migrate_reason, and the associated strings for each enum item.

1. Use the tracing system's mapping of enums to strings, by redefining
   and reusing the MIGRATE_REASON and supporting macros, and using that
   to populate the string array in mm/debug.c.

2. Move enum migrate_reason to migrate_mode.h. This is not strictly
   necessary for this patch, but migrate mode and migrate reason go
   together, so this will slightly clarify things.

Link: https://lkml.kernel.org/r/20210922041755.141817-2-jhubbard@nvidia.com
Signed-off-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: Weizhao Ouyang <o451686892@gmail.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/migrate.h      | 19 +------------------
 include/linux/migrate_mode.h | 13 +++++++++++++
 2 files changed, 14 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index c8077e936691..3d154fe03c96 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -19,24 +19,7 @@ struct migration_target_control;
  */
 #define MIGRATEPAGE_SUCCESS		0
 
-/*
- * Keep sync with:
- * - macro MIGRATE_REASON in include/trace/events/migrate.h
- * - migrate_reason_names[MR_TYPES] in mm/debug.c
- */
-enum migrate_reason {
-	MR_COMPACTION,
-	MR_MEMORY_FAILURE,
-	MR_MEMORY_HOTPLUG,
-	MR_SYSCALL,		/* also applies to cpusets */
-	MR_MEMPOLICY_MBIND,
-	MR_NUMA_MISPLACED,
-	MR_CONTIG_RANGE,
-	MR_LONGTERM_PIN,
-	MR_DEMOTION,
-	MR_TYPES
-};
-
+/* Defined in mm/debug.c: */
 extern const char *migrate_reason_names[MR_TYPES];
 
 #ifdef CONFIG_MIGRATION
diff --git a/include/linux/migrate_mode.h b/include/linux/migrate_mode.h
index 883c99249033..f37cc03f9369 100644
--- a/include/linux/migrate_mode.h
+++ b/include/linux/migrate_mode.h
@@ -19,4 +19,17 @@ enum migrate_mode {
 	MIGRATE_SYNC_NO_COPY,
 };
 
+enum migrate_reason {
+	MR_COMPACTION,
+	MR_MEMORY_FAILURE,
+	MR_MEMORY_HOTPLUG,
+	MR_SYSCALL,		/* also applies to cpusets */
+	MR_MEMPOLICY_MBIND,
+	MR_NUMA_MISPLACED,
+	MR_CONTIG_RANGE,
+	MR_LONGTERM_PIN,
+	MR_DEMOTION,
+	MR_TYPES
+};
+
 #endif		/* MIGRATE_MODE_H_INCLUDED */
-- 
cgit v1.2.3


From 20f9ba4f995247bb79e243741b8fdddbd76dd923 Mon Sep 17 00:00:00 2001
From: Yang Shi <shy828301@gmail.com>
Date: Fri, 5 Nov 2021 13:43:35 -0700
Subject: mm: migrate: make demotion knob depend on migration

The memory demotion needs to call migrate_pages() to do the jobs.  And
it is controlled by a knob, however, the knob doesn't depend on
CONFIG_MIGRATION.  The knob could be truned on even though MIGRATION is
disabled, this will not cause any crash since migrate_pages() would just
return -ENOSYS.  But it is definitely not optimal to go through demotion
path then retry regular swap every time.

And it doesn't make too much sense to have the knob visible to the users
when !MIGRATION.  Move the related code from mempolicy.[h|c] to
migrate.[h|c].

Link: https://lkml.kernel.org/r/20211015005559.246709-1-shy828301@gmail.com
Signed-off-by: Yang Shi <shy828301@gmail.com>
Acked-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mempolicy.h | 4 ----
 include/linux/migrate.h   | 4 ++++
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 097bbbb37493..3c7595e81150 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -183,8 +183,6 @@ extern bool vma_migratable(struct vm_area_struct *vma);
 extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long);
 extern void mpol_put_task_policy(struct task_struct *);
 
-extern bool numa_demotion_enabled;
-
 static inline bool mpol_is_preferred_many(struct mempolicy *pol)
 {
 	return  (pol->mode == MPOL_PREFERRED_MANY);
@@ -300,8 +298,6 @@ static inline nodemask_t *policy_nodemask_current(gfp_t gfp)
 	return NULL;
 }
 
-#define numa_demotion_enabled	false
-
 static inline bool mpol_is_preferred_many(struct mempolicy *pol)
 {
 	return  false;
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 3d154fe03c96..2d8130e05dc0 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -40,6 +40,8 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
 				  struct page *newpage, struct page *page);
 extern int migrate_page_move_mapping(struct address_space *mapping,
 		struct page *newpage, struct page *page, int extra_count);
+
+extern bool numa_demotion_enabled;
 #else
 
 static inline void putback_movable_pages(struct list_head *l) {}
@@ -65,6 +67,8 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
 {
 	return -ENOSYS;
 }
+
+#define numa_demotion_enabled	false
 #endif /* CONFIG_MIGRATION */
 
 #ifdef CONFIG_COMPACTION
-- 
cgit v1.2.3


From 50f9481ed9fb8a2d2a06a155634c7f9eeff9fa61 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 5 Nov 2021 13:44:24 -0700
Subject: mm/memory_hotplug: remove CONFIG_MEMORY_HOTPLUG_SPARSE

CONFIG_MEMORY_HOTPLUG depends on CONFIG_SPARSEMEM, so there is no need for
CONFIG_MEMORY_HOTPLUG_SPARSE anymore; adjust all instances to use
CONFIG_MEMORY_HOTPLUG and remove CONFIG_MEMORY_HOTPLUG_SPARSE.

Link: https://lkml.kernel.org/r/20210929143600.49379-3-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Acked-by: Shuah Khan <skhan@linuxfoundation.org>	[kselftest]
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Oscar Salvador <osalvador@suse.de>
Cc: Alex Shi <alexs@kernel.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memory.h | 24 ++++++++++--------------
 include/linux/node.h   |  4 ++--
 2 files changed, 12 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memory.h b/include/linux/memory.h
index 053a530c7bdd..0328ec039c38 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -110,7 +110,7 @@ struct mem_section;
 #define SLAB_CALLBACK_PRI       1
 #define IPC_CALLBACK_PRI        10
 
-#ifndef CONFIG_MEMORY_HOTPLUG_SPARSE
+#ifndef CONFIG_MEMORY_HOTPLUG
 static inline void memory_dev_init(void)
 {
 	return;
@@ -126,7 +126,14 @@ static inline int memory_notify(unsigned long val, void *v)
 {
 	return 0;
 }
-#else
+static inline int hotplug_memory_notifier(notifier_fn_t fn, int pri)
+{
+	return 0;
+}
+/* These aren't inline functions due to a GCC bug. */
+#define register_hotmemory_notifier(nb)    ({ (void)(nb); 0; })
+#define unregister_hotmemory_notifier(nb)  ({ (void)(nb); })
+#else /* CONFIG_MEMORY_HOTPLUG */
 extern int register_memory_notifier(struct notifier_block *nb);
 extern void unregister_memory_notifier(struct notifier_block *nb);
 int create_memory_block_devices(unsigned long start, unsigned long size,
@@ -148,9 +155,6 @@ struct memory_group *memory_group_find_by_id(int mgid);
 typedef int (*walk_memory_groups_func_t)(struct memory_group *, void *);
 int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
 			       struct memory_group *excluded, void *arg);
-#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
-
-#ifdef CONFIG_MEMORY_HOTPLUG
 #define hotplug_memory_notifier(fn, pri) ({		\
 	static __meminitdata struct notifier_block fn##_mem_nb =\
 		{ .notifier_call = fn, .priority = pri };\
@@ -158,15 +162,7 @@ int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
 })
 #define register_hotmemory_notifier(nb)		register_memory_notifier(nb)
 #define unregister_hotmemory_notifier(nb) 	unregister_memory_notifier(nb)
-#else
-static inline int hotplug_memory_notifier(notifier_fn_t fn, int pri)
-{
-	return 0;
-}
-/* These aren't inline functions due to a GCC bug. */
-#define register_hotmemory_notifier(nb)    ({ (void)(nb); 0; })
-#define unregister_hotmemory_notifier(nb)  ({ (void)(nb); })
-#endif
+#endif	/* CONFIG_MEMORY_HOTPLUG */
 
 /*
  * Kernel text modification mutex, used for code patching. Users of this lock
diff --git a/include/linux/node.h b/include/linux/node.h
index 8e5a29897936..bb21fd631b16 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -85,7 +85,7 @@ struct node {
 	struct device	dev;
 	struct list_head access_list;
 
-#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HUGETLBFS)
+#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_HUGETLBFS)
 	struct work_struct	node_work;
 #endif
 #ifdef CONFIG_HMEM_REPORTING
@@ -98,7 +98,7 @@ struct memory_block;
 extern struct node *node_devices[];
 typedef  void (*node_registration_func_t)(struct node *);
 
-#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_NUMA)
+#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_NUMA)
 void link_mem_sections(int nid, unsigned long start_pfn,
 		       unsigned long end_pfn,
 		       enum meminit_context context);
-- 
cgit v1.2.3


From 6b740c6c3aa371cd70ac07f8d071f2a8af28c51c Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 5 Nov 2021 13:44:31 -0700
Subject: mm/memory_hotplug: remove HIGHMEM leftovers

We don't support CONFIG_MEMORY_HOTPLUG on 32 bit and consequently not
HIGHMEM.  Let's remove any leftover code -- including the unused
"status_change_nid_high" field part of the memory notifier.

Link: https://lkml.kernel.org/r/20210929143600.49379-5-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Alex Shi <alexs@kernel.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memory.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/memory.h b/include/linux/memory.h
index 0328ec039c38..88eb587b5143 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -96,7 +96,6 @@ struct memory_notify {
 	unsigned long start_pfn;
 	unsigned long nr_pages;
 	int status_change_nid_normal;
-	int status_change_nid_high;
 	int status_change_nid;
 };
 
-- 
cgit v1.2.3


From 43e3aa2a3247c9ca348884866a9846d788ec5ed6 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 5 Nov 2021 13:44:35 -0700
Subject: mm/memory_hotplug: remove stale function declarations

These functions no longer exist.

Link: https://lkml.kernel.org/r/20210929143600.49379-6-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Alex Shi <alexs@kernel.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memory_hotplug.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index e5a867c950b2..be48e003a518 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -98,9 +98,6 @@ static inline void zone_seqlock_init(struct zone *zone)
 {
 	seqlock_init(&zone->span_seqlock);
 }
-extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages);
-extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages);
-extern int add_one_highpage(struct page *page, int pfn, int bad_ppro);
 extern void adjust_present_page_count(struct page *page,
 				      struct memory_group *group,
 				      long nr_pages);
-- 
cgit v1.2.3


From e14b41556d9ec20af60a9f672ac6f64dd450763c Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 5 Nov 2021 13:44:46 -0700
Subject: memblock: improve MEMBLOCK_HOTPLUG documentation

The description of MEMBLOCK_HOTPLUG is currently short and consequently
misleading: we're actually dealing with a memory region that might get
hotunplugged later (i.e., the platform+firmware supports it), yet it is
indicated in the firmware-provided memory map as system ram that will
just get used by the system for any purpose when not taking special
care.  The firmware marked this memory region as a hot(un)plugged (e.g.,
hotplugged before reboot), implying that it might get hotunplugged again
later.

Whether we consider this information depends on the "movable_node"
kernel commandline parameter: only with "movable_node" set, we'll try
keeping this memory hotunpluggable, for example, by not serving early
allocations from this memory region and by letting the buddy manage it
using the ZONE_MOVABLE.

Let's make this clearer by extending the documentation.

Note: kexec *has to* indicate this memory to the second kernel.  With
"movable_node" set, we don't want to place kexec-images on this memory.
Without "movable_node" set, we don't care and can place kexec-images on
this memory.  In both cases, after successful memory hotunplug, kexec
has to be re-armed to update the memory map for the second kernel and to
place the kexec-images somewhere else.

Link: https://lkml.kernel.org/r/20211004093605.5830-3-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>
Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.ibm.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Jianyong Wu <Jianyong.Wu@arm.com>
Cc: Jiaxun Yang <jiaxun.yang@flygoat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Shahab Vahedi <shahab@synopsys.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vineet Gupta <vgupta@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memblock.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 484650681bee..1d2b3e902a84 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -28,7 +28,11 @@ extern unsigned long long max_possible_pfn;
 /**
  * enum memblock_flags - definition of memory region attributes
  * @MEMBLOCK_NONE: no special request
- * @MEMBLOCK_HOTPLUG: hotpluggable region
+ * @MEMBLOCK_HOTPLUG: memory region indicated in the firmware-provided memory
+ * map during early boot as hot(un)pluggable system RAM (e.g., memory range
+ * that might get hotunplugged later). With "movable_node" set on the kernel
+ * commandline, try keeping this memory region hotunpluggable. Does not apply
+ * to memblocks added ("hotplugged") after early boot.
  * @MEMBLOCK_MIRROR: mirrored region
  * @MEMBLOCK_NOMAP: don't add to kernel direct mapping and treat as
  * reserved in the memory map; refer to memblock_mark_nomap() description
-- 
cgit v1.2.3


From 952eea9b01e4bbb7011329f1b7240844e61e5128 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 5 Nov 2021 13:44:49 -0700
Subject: memblock: allow to specify flags with memblock_add_node()

We want to specify flags when hotplugging memory.  Let's prepare to pass
flags to memblock_add_node() by adjusting all existing users.

Note that when hotplugging memory the system is already up and running
and we might have concurrent memblock users: for example, while we're
hotplugging memory, kexec_file code might search for suitable memory
regions to place kexec images.  It's important to add the memory
directly to memblock via a single call with the right flags, instead of
adding the memory first and apply flags later: otherwise, concurrent
memblock users might temporarily stumble over memblocks with wrong
flags, which will be important in a follow-up patch that introduces a
new flag to properly handle add_memory_driver_managed().

Link: https://lkml.kernel.org/r/20211004093605.5830-4-david@redhat.com
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Acked-by: Shahab Vahedi <shahab@synopsys.com>	[arch/arc]
Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>
Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.ibm.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Jianyong Wu <Jianyong.Wu@arm.com>
Cc: Jiaxun Yang <jiaxun.yang@flygoat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vineet Gupta <vgupta@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memblock.h | 3 ++-
 include/linux/mm.h       | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 1d2b3e902a84..8b6560dc7a9e 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -104,7 +104,8 @@ static inline void memblock_discard(void) {}
 #endif
 
 void memblock_allow_resize(void);
-int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid);
+int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid,
+		      enum memblock_flags flags);
 int memblock_add(phys_addr_t base, phys_addr_t size);
 int memblock_remove(phys_addr_t base, phys_addr_t size);
 int memblock_phys_free(phys_addr_t base, phys_addr_t size);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7e37c726b9db..15058d9cec99 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2425,7 +2425,7 @@ static inline unsigned long get_num_physpages(void)
  * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn,
  * 							 max_highmem_pfn};
  * for_each_valid_physical_page_range()
- * 	memblock_add_node(base, size, nid)
+ *	memblock_add_node(base, size, nid, MEMBLOCK_NONE)
  * free_area_init(max_zone_pfns);
  */
 void free_area_init(unsigned long *max_zone_pfn);
-- 
cgit v1.2.3


From f7892d8e288d4b090176f26d9bf7943dbbb639a6 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 5 Nov 2021 13:44:53 -0700
Subject: memblock: add MEMBLOCK_DRIVER_MANAGED to mimic
 IORESOURCE_SYSRAM_DRIVER_MANAGED

Let's add a flag that corresponds to IORESOURCE_SYSRAM_DRIVER_MANAGED,
indicating that we're dealing with a memory region that is never
indicated in the firmware-provided memory map, but always detected and
added by a driver.

Similar to MEMBLOCK_HOTPLUG, most infrastructure has to treat such
memory regions like ordinary MEMBLOCK_NONE memory regions -- for
example, when selecting memory regions to add to the vmcore for dumping
in the crashkernel via for_each_mem_range().

However, especially kexec_file is not supposed to select such memblocks
via for_each_free_mem_range() / for_each_free_mem_range_reverse() to
place kexec images, similar to how we handle
IORESOURCE_SYSRAM_DRIVER_MANAGED without CONFIG_ARCH_KEEP_MEMBLOCK.

We'll make sure that memory hotplug code sets the flag where applicable
(IORESOURCE_SYSRAM_DRIVER_MANAGED) next.  This prepares architectures
that need CONFIG_ARCH_KEEP_MEMBLOCK, such as arm64, for virtio-mem
support.

Note that kexec *must not* indicate this memory to the second kernel and
*must not* place kexec-images on this memory.  Let's add a comment to
kexec_walk_memblock(), documenting how we handle MEMBLOCK_DRIVER_MANAGED
now just like using IORESOURCE_SYSRAM_DRIVER_MANAGED in
locate_mem_hole_callback() for kexec_walk_resources().

Also note that MEMBLOCK_HOTPLUG cannot be reused due to different
semantics:
	MEMBLOCK_HOTPLUG: memory is indicated as "System RAM" in the
	firmware-provided memory map and added to the system early during
	boot; kexec *has to* indicate this memory to the second kernel and
	can place kexec-images on this memory. After memory hotunplug,
	kexec has to be re-armed. We mostly ignore this flag when
	"movable_node" is not set on the kernel command line, because
	then we're told to not care about hotunpluggability of such
	memory regions.

	MEMBLOCK_DRIVER_MANAGED: memory is not indicated as "System RAM" in
	the firmware-provided memory map; this memory is always detected
	and added to the system by a driver; memory might not actually be
	physically hotunpluggable. kexec *must not* indicate this memory to
	the second kernel and *must not* place kexec-images on this memory.

Link: https://lkml.kernel.org/r/20211004093605.5830-5-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>
Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.ibm.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Jianyong Wu <Jianyong.Wu@arm.com>
Cc: Jiaxun Yang <jiaxun.yang@flygoat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Shahab Vahedi <shahab@synopsys.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vineet Gupta <vgupta@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memblock.h | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 8b6560dc7a9e..7df557b16c1e 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -37,12 +37,17 @@ extern unsigned long long max_possible_pfn;
  * @MEMBLOCK_NOMAP: don't add to kernel direct mapping and treat as
  * reserved in the memory map; refer to memblock_mark_nomap() description
  * for further details
+ * @MEMBLOCK_DRIVER_MANAGED: memory region that is always detected and added
+ * via a driver, and never indicated in the firmware-provided memory map as
+ * system RAM. This corresponds to IORESOURCE_SYSRAM_DRIVER_MANAGED in the
+ * kernel resource tree.
  */
 enum memblock_flags {
 	MEMBLOCK_NONE		= 0x0,	/* No special request */
 	MEMBLOCK_HOTPLUG	= 0x1,	/* hotpluggable region */
 	MEMBLOCK_MIRROR		= 0x2,	/* mirrored region */
 	MEMBLOCK_NOMAP		= 0x4,	/* don't add to kernel direct mapping */
+	MEMBLOCK_DRIVER_MANAGED = 0x8,	/* always detected via a driver */
 };
 
 /**
@@ -213,7 +218,8 @@ static inline void __next_physmem_range(u64 *idx, struct memblock_type *type,
  */
 #define for_each_mem_range(i, p_start, p_end) \
 	__for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE,	\
-			     MEMBLOCK_HOTPLUG, p_start, p_end, NULL)
+			     MEMBLOCK_HOTPLUG | MEMBLOCK_DRIVER_MANAGED, \
+			     p_start, p_end, NULL)
 
 /**
  * for_each_mem_range_rev - reverse iterate through memblock areas from
@@ -224,7 +230,8 @@ static inline void __next_physmem_range(u64 *idx, struct memblock_type *type,
  */
 #define for_each_mem_range_rev(i, p_start, p_end)			\
 	__for_each_mem_range_rev(i, &memblock.memory, NULL, NUMA_NO_NODE, \
-				 MEMBLOCK_HOTPLUG, p_start, p_end, NULL)
+				 MEMBLOCK_HOTPLUG | MEMBLOCK_DRIVER_MANAGED,\
+				 p_start, p_end, NULL)
 
 /**
  * for_each_reserved_mem_range - iterate over all reserved memblock areas
@@ -254,6 +261,11 @@ static inline bool memblock_is_nomap(struct memblock_region *m)
 	return m->flags & MEMBLOCK_NOMAP;
 }
 
+static inline bool memblock_is_driver_managed(struct memblock_region *m)
+{
+	return m->flags & MEMBLOCK_DRIVER_MANAGED;
+}
+
 int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn,
 			    unsigned long  *end_pfn);
 void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
-- 
cgit v1.2.3


From d2c20e51e3966bc668ef1ef21fbe90704286c8d0 Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Fri, 5 Nov 2021 13:45:06 -0700
Subject: mm/highmem: remove deprecated kmap_atomic

kmap_atomic() is being deprecated in favor of kmap_local_page().

Replace the uses of kmap_atomic() within the highmem code.

On profiling clear_huge_page() using ftrace an improvement of 62% was
observed on the below setup.

Setup:-
Below data has been collected on Qualcomm's SM7250 SoC THP enabled
(kernel v4.19.113) with only CPU-0(Cortex-A55) and CPU-7(Cortex-A76)
switched on and set to max frequency, also DDR set to perf governor.

FTRACE Data:-

Base data:-
Number of iterations: 48
Mean of allocation time: 349.5 us
std deviation: 74.5 us

v4 data:-
Number of iterations: 48
Mean of allocation time: 131 us
std deviation: 32.7 us

The following simple userspace experiment to allocate
100MB(BUF_SZ) of pages and writing to it gave us a good insight,
we observed an improvement of 42% in allocation and writing timings.
-------------------------------------------------------------
Test code snippet
-------------------------------------------------------------
      clock_start();
      buf = malloc(BUF_SZ); /* Allocate 100 MB of memory */

        for(i=0; i < BUF_SZ_PAGES; i++)
        {
                *((int *)(buf + (i*PAGE_SIZE))) = 1;
        }
      clock_end();
-------------------------------------------------------------

Malloc test timings for 100MB anon allocation:-

Base data:-
Number of iterations: 100
Mean of allocation time: 31831 us
std deviation: 4286 us

v4 data:-
Number of iterations: 100
Mean of allocation time: 18193 us
std deviation: 4915 us

[willy@infradead.org: fix zero_user_segments()]
  Link: https://lkml.kernel.org/r/YYVhHCJcm2DM2G9u@casper.infradead.org

Link: https://lkml.kernel.org/r/20210204073255.20769-2-prathu.baronia@oneplus.com
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Prathu Baronia <prathu.baronia@oneplus.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/highmem.h | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index b4c49f9cc379..31ebee36f26c 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -143,9 +143,9 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size)
 #ifndef clear_user_highpage
 static inline void clear_user_highpage(struct page *page, unsigned long vaddr)
 {
-	void *addr = kmap_atomic(page);
+	void *addr = kmap_local_page(page);
 	clear_user_page(addr, vaddr, page);
-	kunmap_atomic(addr);
+	kunmap_local(addr);
 }
 #endif
 
@@ -177,9 +177,9 @@ alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma,
 
 static inline void clear_highpage(struct page *page)
 {
-	void *kaddr = kmap_atomic(page);
+	void *kaddr = kmap_local_page(page);
 	clear_page(kaddr);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 }
 
 #ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE
@@ -202,7 +202,7 @@ static inline void zero_user_segments(struct page *page,
 		unsigned start1, unsigned end1,
 		unsigned start2, unsigned end2)
 {
-	void *kaddr = kmap_atomic(page);
+	void *kaddr = kmap_local_page(page);
 	unsigned int i;
 
 	BUG_ON(end1 > page_size(page) || end2 > page_size(page));
@@ -213,7 +213,7 @@ static inline void zero_user_segments(struct page *page,
 	if (end2 > start2)
 		memset(kaddr + start2, 0, end2 - start2);
 
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 	for (i = 0; i < compound_nr(page); i++)
 		flush_dcache_page(page + i);
 }
@@ -238,11 +238,11 @@ static inline void copy_user_highpage(struct page *to, struct page *from,
 {
 	char *vfrom, *vto;
 
-	vfrom = kmap_atomic(from);
-	vto = kmap_atomic(to);
+	vfrom = kmap_local_page(from);
+	vto = kmap_local_page(to);
 	copy_user_page(vto, vfrom, vaddr, to);
-	kunmap_atomic(vto);
-	kunmap_atomic(vfrom);
+	kunmap_local(vto);
+	kunmap_local(vfrom);
 }
 
 #endif
@@ -253,11 +253,11 @@ static inline void copy_highpage(struct page *to, struct page *from)
 {
 	char *vfrom, *vto;
 
-	vfrom = kmap_atomic(from);
-	vto = kmap_atomic(to);
+	vfrom = kmap_local_page(from);
+	vto = kmap_local_page(to);
 	copy_page(vto, vfrom);
-	kunmap_atomic(vto);
-	kunmap_atomic(vfrom);
+	kunmap_local(vto);
+	kunmap_local(vfrom);
 }
 
 #endif
-- 
cgit v1.2.3


From 53944f171a89dff4e2a3d76f42e6eedb551bb861 Mon Sep 17 00:00:00 2001
From: Stephen Kitt <steve@sk2.org>
Date: Fri, 5 Nov 2021 13:45:18 -0700
Subject: mm: remove HARDENED_USERCOPY_FALLBACK

This has served its purpose and is no longer used.  All usercopy
violations appear to have been handled by now, any remaining instances
(or new bugs) will cause copies to be rejected.

This isn't a direct revert of commit 2d891fbc3bb6 ("usercopy: Allow
strict enforcement of whitelists"); since usercopy_fallback is
effectively 0, the fallback handling is removed too.

This also removes the usercopy_fallback module parameter on slab_common.

Link: https://github.com/KSPP/linux/issues/153
Link: https://lkml.kernel.org/r/20210921061149.1091163-1-steve@sk2.org
Signed-off-by: Stephen Kitt <steve@sk2.org>
Suggested-by: Kees Cook <keescook@chromium.org>
Acked-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Joel Stanley <joel@jms.id.au>	[defconfig change]
Acked-by: David Rientjes <rientjes@google.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: James Morris <jmorris@namei.org>
Cc: "Serge E . Hallyn" <serge@hallyn.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 837cb16232ef..181045148b06 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -142,8 +142,6 @@ struct mem_cgroup;
 void __init kmem_cache_init(void);
 bool slab_is_available(void);
 
-extern bool usercopy_fallback;
-
 struct kmem_cache *kmem_cache_create(const char *name, unsigned int size,
 			unsigned int align, slab_flags_t flags,
 			void (*ctor)(void *));
-- 
cgit v1.2.3


From a1554c002699cbc9ced2e9f44f9c1357181bead3 Mon Sep 17 00:00:00 2001
From: Mianhan Liu <liumh1@shanghaitech.edu.cn>
Date: Fri, 5 Nov 2021 13:45:21 -0700
Subject: include/linux/mm.h: move nr_free_buffer_pages from swap.h to mm.h

nr_free_buffer_pages could be exposed through mm.h instead of swap.h.
The advantage of this change is that it can reduce the obsolete
includes.  For example, net/ipv4/tcp.c wouldn't need swap.h any more
since it has already included mm.h.  Similarly, after checking all the
other files, it comes that tcp.c, udp.c meter.c ,...  follow the same
rule, so these files can have swap.h removed too.

Moreover, after preprocessing all the files that use
nr_free_buffer_pages, it turns out that those files have already
included mm.h.Thus, we can move nr_free_buffer_pages from swap.h to mm.h
safely.  This change will not affect the compilation of other files.

Link: https://lkml.kernel.org/r/20210912133640.1624-1-liumh1@shanghaitech.edu.cn
Signed-off-by: Mianhan Liu <liumh1@shanghaitech.edu.cn>
Cc: Jakub Kicinski <kuba@kernel.org>
CC: Ulf Hansson <ulf.hansson@linaro.org>
Cc: "David S . Miller" <davem@davemloft.net>
Cc: Simon Horman <horms@verge.net.au>
Cc: Pravin B Shelar <pshelar@ovn.org>
Cc: Vlad Yasevich <vyasevich@gmail.com>
Cc: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h   | 2 ++
 include/linux/swap.h | 1 -
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 15058d9cec99..b1720aa63727 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -875,6 +875,8 @@ void put_pages_list(struct list_head *pages);
 void split_page(struct page *page, unsigned int order);
 void copy_huge_page(struct page *dst, struct page *src);
 
+unsigned long nr_free_buffer_pages(void);
+
 /*
  * Compound pages have a destructor function.  Provide a
  * prototype for that function and accessor functions.
diff --git a/include/linux/swap.h b/include/linux/swap.h
index ba52f3a3478e..53f759a59996 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -335,7 +335,6 @@ void workingset_update_node(struct xa_node *node);
 
 /* linux/mm/page_alloc.c */
 extern unsigned long totalreserve_pages;
-extern unsigned long nr_free_buffer_pages(void);
 
 /* Definition of global_zone_page_state not available yet */
 #define nr_free_pages() global_zone_page_state(NR_FREE_PAGES)
-- 
cgit v1.2.3


From f39f21b3ddc7fc0f87eb6dc75ddc81b5bbfb7672 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Fri, 5 Nov 2021 13:45:25 -0700
Subject: stacktrace: move filter_irq_stacks() to kernel/stacktrace.c

filter_irq_stacks() has little to do with the stackdepot implementation,
except that it is usually used by users (such as KASAN) of stackdepot to
reduce the stack trace.

However, filter_irq_stacks() itself is not useful without a stack trace
as obtained by stack_trace_save() and friends.

Therefore, move filter_irq_stacks() to kernel/stacktrace.c, so that new
users of filter_irq_stacks() do not have to start depending on
STACKDEPOT only for filter_irq_stacks().

Link: https://lkml.kernel.org/r/20210923104803.2620285-1-elver@google.com
Signed-off-by: Marco Elver <elver@google.com>
Acked-by: Dmitry Vyukov <dvyukov@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Jann Horn <jannh@google.com>
Cc: Aleksandr Nogikh <nogikh@google.com>
Cc: Taras Madan <tarasmadan@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/stackdepot.h | 2 --
 include/linux/stacktrace.h | 1 +
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h
index b2f7e7c6ba54..d29860966bc9 100644
--- a/include/linux/stackdepot.h
+++ b/include/linux/stackdepot.h
@@ -25,8 +25,6 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries,
 unsigned int stack_depot_fetch(depot_stack_handle_t handle,
 			       unsigned long **entries);
 
-unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries);
-
 #ifdef CONFIG_STACKDEPOT
 int stack_depot_init(void);
 #else
diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h
index 9edecb494e9e..bef158815e83 100644
--- a/include/linux/stacktrace.h
+++ b/include/linux/stacktrace.h
@@ -21,6 +21,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *task,
 unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store,
 				   unsigned int size, unsigned int skipnr);
 unsigned int stack_trace_save_user(unsigned long *store, unsigned int size);
+unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries);
 
 /* Internal interfaces. Do not use in generic code */
 #ifdef CONFIG_ARCH_STACKWALK
-- 
cgit v1.2.3


From 07e8481d3c38f461d7b79c1d5c9afe013b162b0c Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Fri, 5 Nov 2021 13:45:46 -0700
Subject: kfence: always use static branches to guard kfence_alloc()

Regardless of KFENCE mode (CONFIG_KFENCE_STATIC_KEYS: either using
static keys to gate allocations, or using a simple dynamic branch),
always use a static branch to avoid the dynamic branch in kfence_alloc()
if KFENCE was disabled at boot.

For CONFIG_KFENCE_STATIC_KEYS=n, this now avoids the dynamic branch if
KFENCE was disabled at boot.

To simplify, also unifies the location where kfence_allocation_gate is
read-checked to just be inline in kfence_alloc().

Link: https://lkml.kernel.org/r/20211019102524.2807208-1-elver@google.com
Signed-off-by: Marco Elver <elver@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kfence.h | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kfence.h b/include/linux/kfence.h
index 3fe6dd8a18c1..4b5e3679a72c 100644
--- a/include/linux/kfence.h
+++ b/include/linux/kfence.h
@@ -14,6 +14,9 @@
 
 #ifdef CONFIG_KFENCE
 
+#include <linux/atomic.h>
+#include <linux/static_key.h>
+
 /*
  * We allocate an even number of pages, as it simplifies calculations to map
  * address to metadata indices; effectively, the very first page serves as an
@@ -22,13 +25,8 @@
 #define KFENCE_POOL_SIZE ((CONFIG_KFENCE_NUM_OBJECTS + 1) * 2 * PAGE_SIZE)
 extern char *__kfence_pool;
 
-#ifdef CONFIG_KFENCE_STATIC_KEYS
-#include <linux/static_key.h>
 DECLARE_STATIC_KEY_FALSE(kfence_allocation_key);
-#else
-#include <linux/atomic.h>
 extern atomic_t kfence_allocation_gate;
-#endif
 
 /**
  * is_kfence_address() - check if an address belongs to KFENCE pool
@@ -116,13 +114,16 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags);
  */
 static __always_inline void *kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
 {
-#ifdef CONFIG_KFENCE_STATIC_KEYS
-	if (static_branch_unlikely(&kfence_allocation_key))
+#if defined(CONFIG_KFENCE_STATIC_KEYS) || CONFIG_KFENCE_SAMPLE_INTERVAL == 0
+	if (!static_branch_unlikely(&kfence_allocation_key))
+		return NULL;
 #else
-	if (unlikely(!atomic_read(&kfence_allocation_gate)))
+	if (!static_branch_likely(&kfence_allocation_key))
+		return NULL;
 #endif
-		return __kfence_alloc(s, size, flags);
-	return NULL;
+	if (likely(atomic_read(&kfence_allocation_gate)))
+		return NULL;
+	return __kfence_alloc(s, size, flags);
 }
 
 /**
-- 
cgit v1.2.3


From d2f272b35a84ace2ef04334a9822fd726a7f061b Mon Sep 17 00:00:00 2001
From: SeongJae Park <sjpark@amazon.de>
Date: Fri, 5 Nov 2021 13:46:04 -0700
Subject: include/linux/damon.h: fix kernel-doc comments for 'damon_callback'

A few Kernel-doc comments in 'damon.h' are broken.  This fixes them.

Link: https://lkml.kernel.org/r/20210917123958.3819-5-sj@kernel.org
Signed-off-by: SeongJae Park <sjpark@amazon.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/damon.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index d68b67b8d458..755d70804705 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -62,7 +62,7 @@ struct damon_target {
 struct damon_ctx;
 
 /**
- * struct damon_primitive	Monitoring primitives for given use cases.
+ * struct damon_primitive - Monitoring primitives for given use cases.
  *
  * @init:			Initialize primitive-internal data structures.
  * @update:			Update primitive-internal data structures.
@@ -108,8 +108,8 @@ struct damon_primitive {
 	void (*cleanup)(struct damon_ctx *context);
 };
 
-/*
- * struct damon_callback	Monitoring events notification callbacks.
+/**
+ * struct damon_callback - Monitoring events notification callbacks.
  *
  * @before_start:	Called before starting the monitoring.
  * @after_sampling:	Called after each sampling.
-- 
cgit v1.2.3


From fda504fade7f124858d7022341dc46ff35b45274 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 5 Nov 2021 13:46:18 -0700
Subject: mm/damon/core: account age of target regions

Patch series "Implement Data Access Monitoring-based Memory Operation Schemes".

Introduction
============

DAMON[1] can be used as a primitive for data access aware memory
management optimizations.  For that, users who want such optimizations
should run DAMON, read the monitoring results, analyze it, plan a new
memory management scheme, and apply the new scheme by themselves.  Such
efforts will be inevitable for some complicated optimizations.

However, in many other cases, the users would simply want the system to
apply a memory management action to a memory region of a specific size
having a specific access frequency for a specific time.  For example,
"page out a memory region larger than 100 MiB keeping only rare accesses
more than 2 minutes", or "Do not use THP for a memory region larger than
2 MiB rarely accessed for more than 1 seconds".

To make the works easier and non-redundant, this patchset implements a
new feature of DAMON, which is called Data Access Monitoring-based
Operation Schemes (DAMOS).  Using the feature, users can describe the
normal schemes in a simple way and ask DAMON to execute those on its
own.

[1] https://damonitor.github.io

Evaluations
===========

DAMOS is accurate and useful for memory management optimizations.  An
experimental DAMON-based operation scheme for THP, 'ethp', removes
76.15% of THP memory overheads while preserving 51.25% of THP speedup.
Another experimental DAMON-based 'proactive reclamation' implementation,
'prcl', reduces 93.38% of residential sets and 23.63% of system memory
footprint while incurring only 1.22% runtime overhead in the best case
(parsec3/freqmine).

NOTE that the experimental THP optimization and proactive reclamation
are not for production but only for proof of concepts.

Please refer to the showcase web site's evaluation document[1] for
detailed evaluation setup and results.

[1] https://damonitor.github.io/doc/html/v34/vm/damon/eval.html

Long-term Support Trees
-----------------------

For people who want to test DAMON but using LTS kernels, there are
another couple of trees based on two latest LTS kernels respectively and
containing the 'damon/master' backports.

- For v5.4.y: https://git.kernel.org/sj/h/damon/for-v5.4.y
- For v5.10.y: https://git.kernel.org/sj/h/damon/for-v5.10.y

Sequence Of Patches
===================

The 1st patch accounts age of each region.  The 2nd patch implements the
core of the DAMON-based operation schemes feature.  The 3rd patch makes
the default monitoring primitives for virtual address spaces to support
the schemes.  From this point, the kernel space users can use DAMOS.
The 4th patch exports the feature to the user space via the debugfs
interface.  The 5th patch implements schemes statistics feature for
easier tuning of the schemes and runtime access pattern analysis, and
the 6th patch adds selftests for these changes.  Finally, the 7th patch
documents this new feature.

This patch (of 7):

DAMON can be used for data access pattern aware memory management
optimizations.  For that, users should run DAMON, read the monitoring
results, analyze it, plan a new memory management scheme, and apply the
new scheme by themselves.  It would not be too hard, but still require
some level of effort.  For complicated cases, this effort is inevitable.

That said, in many cases, users would simply want to apply an actions to
a memory region of a specific size having a specific access frequency
for a specific time.  For example, "page out a memory region larger than
100 MiB but having a low access frequency more than 10 minutes", or "Use
THP for a memory region larger than 2 MiB having a high access frequency
for more than 2 seconds".

For such optimizations, users will need to first account the age of each
region themselves.  To reduce such efforts, this implements a simple age
account of each region in DAMON.  For each aggregation step, DAMON
compares the access frequency with that from last aggregation and reset
the age of the region if the change is significant.  Else, the age is
incremented.  Also, in case of the merge of regions, the region
size-weighted average of the ages is set as the age of merged new
region.

Link: https://lkml.kernel.org/r/20211001125604.29660-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20211001125604.29660-2-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Amit Shah <amit@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Woodhouse <dwmw@amazon.com>
Cc: Marco Elver <elver@google.com>
Cc: Leonard Foerster <foersleo@amazon.de>
Cc: Greg Thelen <gthelen@google.com>
Cc: Markus Boehme <markubo@amazon.de>
Cc: David Rienjes <rientjes@google.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/damon.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 755d70804705..3e8215debbd4 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -31,12 +31,22 @@ struct damon_addr_range {
  * @sampling_addr:	Address of the sample for the next access check.
  * @nr_accesses:	Access frequency of this region.
  * @list:		List head for siblings.
+ * @age:		Age of this region.
+ *
+ * @age is initially zero, increased for each aggregation interval, and reset
+ * to zero again if the access frequency is significantly changed.  If two
+ * regions are merged into a new region, both @nr_accesses and @age of the new
+ * region are set as region size-weighted average of those of the two regions.
  */
 struct damon_region {
 	struct damon_addr_range ar;
 	unsigned long sampling_addr;
 	unsigned int nr_accesses;
 	struct list_head list;
+
+	unsigned int age;
+/* private: Internal value for age calculation. */
+	unsigned int last_nr_accesses;
 };
 
 /**
-- 
cgit v1.2.3


From 1f366e421c8f69583ed37b56d86e3747331869c3 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 5 Nov 2021 13:46:22 -0700
Subject: mm/damon/core: implement DAMON-based Operation Schemes (DAMOS)

In many cases, users might use DAMON for simple data access aware memory
management optimizations such as applying an operation scheme to a
memory region of a specific size having a specific access frequency for
a specific time.  For example, "page out a memory region larger than 100
MiB but having a low access frequency more than 10 minutes", or "Use THP
for a memory region larger than 2 MiB having a high access frequency for
more than 2 seconds".

Most simple form of the solution would be doing offline data access
pattern profiling using DAMON and modifying the application source code
or system configuration based on the profiling results.  Or, developing
a daemon constructed with two modules (one for access monitoring and the
other for applying memory management actions via mlock(), madvise(),
sysctl, etc) is imaginable.

To avoid users spending their time for implementation of such simple
data access monitoring-based operation schemes, this makes DAMON to
handle such schemes directly.  With this change, users can simply
specify their desired schemes to DAMON.  Then, DAMON will automatically
apply the schemes to the user-specified target processes.

Each of the schemes is composed with conditions for filtering of the
target memory regions and desired memory management action for the
target.  Specifically, the format is::

    <min/max size> <min/max access frequency> <min/max age> <action>

The filtering conditions are size of memory region, number of accesses
to the region monitored by DAMON, and the age of the region.  The age of
region is incremented periodically but reset when its addresses or
access frequency has significantly changed or the action of a scheme was
applied.  For the action, current implementation supports a few of
madvise()-like hints, ``WILLNEED``, ``COLD``, ``PAGEOUT``, ``HUGEPAGE``,
and ``NOHUGEPAGE``.

Because DAMON supports various address spaces and application of the
actions to a monitoring target region is dependent to the type of the
target address space, the application code should be implemented by each
primitives and registered to the framework.  Note that this only
implements the framework part.  Following commit will implement the
action applications for virtual address spaces primitives.

Link: https://lkml.kernel.org/r/20211001125604.29660-3-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Amit Shah <amit@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Rienjes <rientjes@google.com>
Cc: David Woodhouse <dwmw@amazon.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Leonard Foerster <foersleo@amazon.de>
Cc: Marco Elver <elver@google.com>
Cc: Markus Boehme <markubo@amazon.de>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/damon.h | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 3e8215debbd4..dbe18b0fb795 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -69,6 +69,48 @@ struct damon_target {
 	struct list_head list;
 };
 
+/**
+ * enum damos_action - Represents an action of a Data Access Monitoring-based
+ * Operation Scheme.
+ *
+ * @DAMOS_WILLNEED:	Call ``madvise()`` for the region with MADV_WILLNEED.
+ * @DAMOS_COLD:		Call ``madvise()`` for the region with MADV_COLD.
+ * @DAMOS_PAGEOUT:	Call ``madvise()`` for the region with MADV_PAGEOUT.
+ * @DAMOS_HUGEPAGE:	Call ``madvise()`` for the region with MADV_HUGEPAGE.
+ * @DAMOS_NOHUGEPAGE:	Call ``madvise()`` for the region with MADV_NOHUGEPAGE.
+ */
+enum damos_action {
+	DAMOS_WILLNEED,
+	DAMOS_COLD,
+	DAMOS_PAGEOUT,
+	DAMOS_HUGEPAGE,
+	DAMOS_NOHUGEPAGE,
+};
+
+/**
+ * struct damos - Represents a Data Access Monitoring-based Operation Scheme.
+ * @min_sz_region:	Minimum size of target regions.
+ * @max_sz_region:	Maximum size of target regions.
+ * @min_nr_accesses:	Minimum ``->nr_accesses`` of target regions.
+ * @max_nr_accesses:	Maximum ``->nr_accesses`` of target regions.
+ * @min_age_region:	Minimum age of target regions.
+ * @max_age_region:	Maximum age of target regions.
+ * @action:		&damo_action to be applied to the target regions.
+ * @list:		List head for siblings.
+ *
+ * Note that both the minimums and the maximums are inclusive.
+ */
+struct damos {
+	unsigned long min_sz_region;
+	unsigned long max_sz_region;
+	unsigned int min_nr_accesses;
+	unsigned int max_nr_accesses;
+	unsigned int min_age_region;
+	unsigned int max_age_region;
+	enum damos_action action;
+	struct list_head list;
+};
+
 struct damon_ctx;
 
 /**
@@ -79,6 +121,7 @@ struct damon_ctx;
  * @prepare_access_checks:	Prepare next access check of target regions.
  * @check_accesses:		Check the accesses to target regions.
  * @reset_aggregated:		Reset aggregated accesses monitoring results.
+ * @apply_scheme:		Apply a DAMON-based operation scheme.
  * @target_valid:		Determine if the target is valid.
  * @cleanup:			Clean up the context.
  *
@@ -104,6 +147,9 @@ struct damon_ctx;
  * of its update.  The value will be used for regions adjustment threshold.
  * @reset_aggregated should reset the access monitoring results that aggregated
  * by @check_accesses.
+ * @apply_scheme is called from @kdamond when a region for user provided
+ * DAMON-based operation scheme is found.  It should apply the scheme's action
+ * to the region.  This is not used for &DAMON_ARBITRARY_TARGET case.
  * @target_valid should check whether the target is still valid for the
  * monitoring.
  * @cleanup is called from @kdamond just before its termination.
@@ -114,6 +160,8 @@ struct damon_primitive {
 	void (*prepare_access_checks)(struct damon_ctx *context);
 	unsigned int (*check_accesses)(struct damon_ctx *context);
 	void (*reset_aggregated)(struct damon_ctx *context);
+	int (*apply_scheme)(struct damon_ctx *context, struct damon_target *t,
+			struct damon_region *r, struct damos *scheme);
 	bool (*target_valid)(void *target);
 	void (*cleanup)(struct damon_ctx *context);
 };
@@ -192,6 +240,7 @@ struct damon_callback {
  * @min_nr_regions:	The minimum number of adaptive monitoring regions.
  * @max_nr_regions:	The maximum number of adaptive monitoring regions.
  * @adaptive_targets:	Head of monitoring targets (&damon_target) list.
+ * @schemes:		Head of schemes (&damos) list.
  */
 struct damon_ctx {
 	unsigned long sample_interval;
@@ -213,6 +262,7 @@ struct damon_ctx {
 	unsigned long min_nr_regions;
 	unsigned long max_nr_regions;
 	struct list_head adaptive_targets;
+	struct list_head schemes;
 };
 
 #define damon_next_region(r) \
@@ -233,6 +283,12 @@ struct damon_ctx {
 #define damon_for_each_target_safe(t, next, ctx)	\
 	list_for_each_entry_safe(t, next, &(ctx)->adaptive_targets, list)
 
+#define damon_for_each_scheme(s, ctx) \
+	list_for_each_entry(s, &(ctx)->schemes, list)
+
+#define damon_for_each_scheme_safe(s, next, ctx) \
+	list_for_each_entry_safe(s, next, &(ctx)->schemes, list)
+
 #ifdef CONFIG_DAMON
 
 struct damon_region *damon_new_region(unsigned long start, unsigned long end);
@@ -242,6 +298,14 @@ inline void damon_insert_region(struct damon_region *r,
 void damon_add_region(struct damon_region *r, struct damon_target *t);
 void damon_destroy_region(struct damon_region *r, struct damon_target *t);
 
+struct damos *damon_new_scheme(
+		unsigned long min_sz_region, unsigned long max_sz_region,
+		unsigned int min_nr_accesses, unsigned int max_nr_accesses,
+		unsigned int min_age_region, unsigned int max_age_region,
+		enum damos_action action);
+void damon_add_scheme(struct damon_ctx *ctx, struct damos *s);
+void damon_destroy_scheme(struct damos *s);
+
 struct damon_target *damon_new_target(unsigned long id);
 void damon_add_target(struct damon_ctx *ctx, struct damon_target *t);
 void damon_free_target(struct damon_target *t);
@@ -255,6 +319,8 @@ int damon_set_targets(struct damon_ctx *ctx,
 int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int,
 		unsigned long aggr_int, unsigned long primitive_upd_int,
 		unsigned long min_nr_reg, unsigned long max_nr_reg);
+int damon_set_schemes(struct damon_ctx *ctx,
+			struct damos **schemes, ssize_t nr_schemes);
 int damon_nr_running_ctxs(void);
 
 int damon_start(struct damon_ctx **ctxs, int nr_ctxs);
-- 
cgit v1.2.3


From 6dea8add4d2875b80843e4a4c8acd334a4db8c8f Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 5 Nov 2021 13:46:25 -0700
Subject: mm/damon/vaddr: support DAMON-based Operation Schemes

This makes DAMON's default primitives for virtual address spaces to
support DAMON-based Operation Schemes (DAMOS) by implementing actions
application functions and registering it to the monitoring context.  The
implementation simply links 'madvise()' for related DAMOS actions.  That
is, 'madvise(MADV_WILLNEED)' is called for 'WILLNEED' DAMOS action and
similar for other actions ('COLD', 'PAGEOUT', 'HUGEPAGE', 'NOHUGEPAGE').

So, the kernel space DAMON users can now use the DAMON-based
optimizations with only small amount of code.

Link: https://lkml.kernel.org/r/20211001125604.29660-4-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Amit Shah <amit@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Rienjes <rientjes@google.com>
Cc: David Woodhouse <dwmw@amazon.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Leonard Foerster <foersleo@amazon.de>
Cc: Marco Elver <elver@google.com>
Cc: Markus Boehme <markubo@amazon.de>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/damon.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index dbe18b0fb795..be6b6e81e8ee 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -337,6 +337,8 @@ void damon_va_prepare_access_checks(struct damon_ctx *ctx);
 unsigned int damon_va_check_accesses(struct damon_ctx *ctx);
 bool damon_va_target_valid(void *t);
 void damon_va_cleanup(struct damon_ctx *ctx);
+int damon_va_apply_scheme(struct damon_ctx *context, struct damon_target *t,
+		struct damon_region *r, struct damos *scheme);
 void damon_va_set_primitives(struct damon_ctx *ctx);
 
 #endif	/* CONFIG_DAMON_VADDR */
-- 
cgit v1.2.3


From 2f0b548c9f03a78f4ce6ab48986e3108028936a6 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 5 Nov 2021 13:46:32 -0700
Subject: mm/damon/schemes: implement statistics feature

To tune the DAMON-based operation schemes, knowing how many and how
large regions are affected by each of the schemes will be helful.  Those
stats could be used for not only the tuning, but also monitoring of the
working set size and the number of regions, if the scheme does not
change the program behavior too much.

For the reason, this implements the statistics for the schemes.  The
total number and size of the regions that each scheme is applied are
exported to users via '->stat_count' and '->stat_sz' of 'struct damos'.
Admins can also check the number by reading 'schemes' debugfs file.  The
last two integers now represents the stats.  To allow collecting the
stats without changing the program behavior, this also adds new scheme
action, 'DAMOS_STAT'.  Note that 'DAMOS_STAT' is not only making no
memory operation actions, but also does not reset the age of regions.

Link: https://lkml.kernel.org/r/20211001125604.29660-6-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Amit Shah <amit@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Rienjes <rientjes@google.com>
Cc: David Woodhouse <dwmw@amazon.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Leonard Foerster <foersleo@amazon.de>
Cc: Marco Elver <elver@google.com>
Cc: Markus Boehme <markubo@amazon.de>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/damon.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index be6b6e81e8ee..f301bb53381c 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -78,6 +78,7 @@ struct damon_target {
  * @DAMOS_PAGEOUT:	Call ``madvise()`` for the region with MADV_PAGEOUT.
  * @DAMOS_HUGEPAGE:	Call ``madvise()`` for the region with MADV_HUGEPAGE.
  * @DAMOS_NOHUGEPAGE:	Call ``madvise()`` for the region with MADV_NOHUGEPAGE.
+ * @DAMOS_STAT:		Do nothing but count the stat.
  */
 enum damos_action {
 	DAMOS_WILLNEED,
@@ -85,6 +86,7 @@ enum damos_action {
 	DAMOS_PAGEOUT,
 	DAMOS_HUGEPAGE,
 	DAMOS_NOHUGEPAGE,
+	DAMOS_STAT,		/* Do nothing but only record the stat */
 };
 
 /**
@@ -96,9 +98,13 @@ enum damos_action {
  * @min_age_region:	Minimum age of target regions.
  * @max_age_region:	Maximum age of target regions.
  * @action:		&damo_action to be applied to the target regions.
+ * @stat_count:		Total number of regions that this scheme is applied.
+ * @stat_sz:		Total size of regions that this scheme is applied.
  * @list:		List head for siblings.
  *
- * Note that both the minimums and the maximums are inclusive.
+ * For each aggregation interval, DAMON applies @action to monitoring target
+ * regions fit in the condition and updates the statistics.  Note that both
+ * the minimums and the maximums are inclusive.
  */
 struct damos {
 	unsigned long min_sz_region;
@@ -108,6 +114,8 @@ struct damos {
 	unsigned int min_age_region;
 	unsigned int max_age_region;
 	enum damos_action action;
+	unsigned long stat_count;
+	unsigned long stat_sz;
 	struct list_head list;
 };
 
-- 
cgit v1.2.3


From a28397beb55b68bd0f15c6778540e8ae1bc26d21 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 5 Nov 2021 13:46:56 -0700
Subject: mm/damon: implement primitives for physical address space monitoring

This implements the monitoring primitives for the physical memory
address space.  Internally, it uses the PTE Accessed bit, similar to
that of the virtual address spaces monitoring primitives.  It supports
only user memory pages, as idle pages tracking does.  If the monitoring
target physical memory address range contains non-user memory pages,
access check of the pages will do nothing but simply treat the pages as
not accessed.

Link: https://lkml.kernel.org/r/20211012205711.29216-6-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Amit Shah <amit@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Brendan Higgins <brendanhiggins@google.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Rienjes <rientjes@google.com>
Cc: David Woodhouse <dwmw@amazon.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Leonard Foerster <foersleo@amazon.de>
Cc: Marco Elver <elver@google.com>
Cc: Markus Boehme <markubo@amazon.de>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/damon.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index f301bb53381c..715dadd21f7c 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -351,4 +351,14 @@ void damon_va_set_primitives(struct damon_ctx *ctx);
 
 #endif	/* CONFIG_DAMON_VADDR */
 
+#ifdef CONFIG_DAMON_PADDR
+
+/* Monitoring primitives for the physical memory address space */
+void damon_pa_prepare_access_checks(struct damon_ctx *ctx);
+unsigned int damon_pa_check_accesses(struct damon_ctx *ctx);
+bool damon_pa_target_valid(void *t);
+void damon_pa_set_primitives(struct damon_ctx *ctx);
+
+#endif	/* CONFIG_DAMON_PADDR */
+
 #endif	/* _DAMON_H */
-- 
cgit v1.2.3


From 57223ac295845b1d72ec1bd02b5fab992b77a021 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 5 Nov 2021 13:47:13 -0700
Subject: mm/damon/paddr: support the pageout scheme

Introduction
============

This patchset 1) makes the engine for general data access
pattern-oriented memory management (DAMOS) be more useful for production
environments, and 2) implements a static kernel module for lightweight
proactive reclamation using the engine.

Proactive Reclamation
---------------------

On general memory over-committed systems, proactively reclaiming cold
pages helps saving memory and reducing latency spikes that incurred by
the direct reclaim or the CPU consumption of kswapd, while incurring
only minimal performance degradation[2].

A Free Pages Reporting[8] based memory over-commit virtualization system
would be one more specific use case.  In the system, the guest VMs
reports their free memory to host, and the host reallocates the reported
memory to other guests.  As a result, the system's memory utilization
can be maximized.  However, the guests could be not so memory-frugal,
because some kernel subsystems and user-space applications are designed
to use as much memory as available.  Then, guests would report only
small amount of free memory to host, results in poor memory utilization.
Running the proactive reclamation in such guests could help mitigating
this problem.

Google has also implemented this idea and using it in their data center.
They further proposed upstreaming it in LSFMM'19, and "the general
consensus was that, while this sort of proactive reclaim would be useful
for a number of users, the cost of this particular solution was too high
to consider merging it upstream"[3].  The cost mainly comes from the
coldness tracking.  Roughly speaking, the implementation periodically
scans the 'Accessed' bit of each page.  For the reason, the overhead
linearly increases as the size of the memory and the scanning frequency
grows.  As a result, Google is known to dedicating one CPU for the work.
That's a reasonable option to someone like Google, but it wouldn't be so
to some others.

DAMON and DAMOS: An engine for data access pattern-oriented memory management
-----------------------------------------------------------------------------

DAMON[4] is a framework for general data access monitoring.  Its
adaptive monitoring overhead control feature minimizes its monitoring
overhead.  It also let the upper-bound of the overhead be configurable
by clients, regardless of the size of the monitoring target memory.
While monitoring 70 GiB memory of a production system every 5
milliseconds, it consumes less than 1% single CPU time.  For this, it
could sacrify some of the quality of the monitoring results.
Nevertheless, the lower-bound of the quality is configurable, and it
uses a best-effort algorithm for better quality.  Our test results[5]
show the quality is practical enough.  From the production system
monitoring, we were able to find a 4 KiB region in the 70 GiB memory
that shows highest access frequency.

We normally don't monitor the data access pattern just for fun but to
improve something like memory management.  Proactive reclamation is one
such usage.  For such general cases, DAMON provides a feature called
DAMon-based Operation Schemes (DAMOS)[6].  It makes DAMON an engine for
general data access pattern oriented memory management.  Using this,
clients can ask DAMON to find memory regions of specific data access
pattern and apply some memory management action (e.g., page out, move to
head of the LRU list, use huge page, ...).  We call the request
'scheme'.

Proactive Reclamation on top of DAMON/DAMOS
-------------------------------------------

Therefore, by using DAMON for the cold pages detection, the proactive
reclamation's monitoring overhead issue can be solved.  Actually, we
previously implemented a version of proactive reclamation using DAMOS
and achieved noticeable improvements with our evaluation setup[5].
Nevertheless, it more for a proof-of-concept, rather than production
uses.  It supports only virtual address spaces of processes, and require
additional tuning efforts for given workloads and the hardware.  For the
tuning, we introduced a simple auto-tuning user space tool[8].  Google
is also known to using a ML-based similar approach for their fleets[2].
But, making it just works with intuitive knobs in the kernel would be
helpful for general users.

To this end, this patchset improves DAMOS to be ready for such
production usages, and implements another version of the proactive
reclamation, namely DAMON_RECLAIM, on top of it.

DAMOS Improvements: Aggressiveness Control, Prioritization, and Watermarks
--------------------------------------------------------------------------

First of all, the current version of DAMOS supports only virtual address
spaces.  This patchset makes it supports the physical address space for
the page out action.

Next major problem of the current version of DAMOS is the lack of the
aggressiveness control, which can results in arbitrary overhead.  For
example, if huge memory regions having the data access pattern of
interest are found, applying the requested action to all of the regions
could incur significant overhead.  It can be controlled by tuning the
target data access pattern with manual or automated approaches[2,7].
But, some people would prefer the kernel to just work with only
intuitive tuning or default values.

For such cases, this patchset implements a safeguard, namely time/size
quota.  Using this, the clients can specify up to how much time can be
used for applying the action, and/or up to how much memory regions the
action can be applied within a user-specified time duration.  A followup
question is, to which memory regions should the action applied within
the limits? We implement a simple regions prioritization mechanism for
each action and make DAMOS to apply the action to high priority regions
first.  It also allows clients tune the prioritization mechanism to use
different weights for size, access frequency, and age of memory regions.
This means we could use not only LRU but also LFU or some fancy
algorithms like CAR[9] with lightweight overhead.

Though DAMON is lightweight, someone would want to remove even the cold
pages monitoring overhead when it is unnecessary.  Currently, it should
manually turned on and off by clients, but some clients would simply
want to turn it on and off based on some metrics like free memory ratio
or memory fragmentation.  For such cases, this patchset implements a
watermarks-based automatic activation feature.  It allows the clients
configure the metric of their interest, and three watermarks of the
metric.  If the metric is higher than the high watermark or lower than
the low watermark, the scheme is deactivated.  If the metric is lower
than the mid watermark but higher than the low watermark, the scheme is
activated.

DAMON-based Reclaim
-------------------

Using the improved version of DAMOS, this patchset implements a static
kernel module called 'damon_reclaim'.  It finds memory regions that
didn't accessed for specific time duration and page out.  Consuming too
much CPU for the paging out operations, or doing pageout too frequently
can be critical for systems configuring their swap devices with
software-defined in-memory block devices like zram/zswap or total number
of writes limited devices like SSDs, respectively.  To avoid the
problems, the time/size quotas can be configured.  Under the quotas, it
pages out memory regions that didn't accessed longer first.  Also, to
remove the monitoring overhead under peaceful situation, and to fall
back to the LRU-list based page granularity reclamation when it doesn't
make progress, the three watermarks based activation mechanism is used,
with the free memory ratio as the watermark metric.

For convenient configurations, it provides several module parameters.
Using these, sysadmins can enable/disable it, and tune its parameters
including the coldness identification time threshold, the time/size
quotas and the three watermarks.

Evaluation
==========

In short, DAMON_RECLAIM with 50ms/s time quota and regions
prioritization on v5.15-rc5 Linux kernel with ZRAM swap device achieves
38.58% memory saving with only 1.94% runtime overhead.  For this,
DAMON_RECLAIM consumes only 4.97% of single CPU time.

Setup
-----

We evaluate DAMON_RECLAIM to show how each of the DAMOS improvements
make effect.  For this, we measure DAMON_RECLAIM's CPU consumption,
entire system memory footprint, total number of major page faults, and
runtime of 24 realistic workloads in PARSEC3 and SPLASH-2X benchmark
suites on my QEMU/KVM based virtual machine.  The virtual machine runs
on an i3.metal AWS instance, has 130GiB memory, and runs a linux kernel
built on latest -mm tree[1] plus this patchset.  It also utilizes a 4
GiB ZRAM swap device.  We repeats the measurement 5 times and use
averages.

[1] https://github.com/hnaz/linux-mm/tree/v5.15-rc5-mmots-2021-10-13-19-55

Detailed Results
----------------

The results are summarized in the below table.

With coldness identification threshold of 5 seconds, DAMON_RECLAIM
without the time quota-based speed limit achieves 47.21% memory saving,
but incur 4.59% runtime slowdown to the workloads on average.  For this,
DAMON_RECLAIM consumes about 11.28% single CPU time.

Applying time quotas of 200ms/s, 50ms/s, and 10ms/s without the regions
prioritization reduces the slowdown to 4.89%, 2.65%, and 1.5%,
respectively.  Time quota of 200ms/s (20%) makes no real change compared
to the quota unapplied version, because the quota unapplied version
consumes only 11.28% CPU time.  DAMON_RECLAIM's CPU utilization also
similarly reduced: 11.24%, 5.51%, and 2.01% of single CPU time.  That
is, the overhead is proportional to the speed limit.  Nevertheless, it
also reduces the memory saving because it becomes less aggressive.  In
detail, the three variants show 48.76%, 37.83%, and 7.85% memory saving,
respectively.

Applying the regions prioritization (page out regions that not accessed
longer first within the time quota) further reduces the performance
degradation.  Runtime slowdowns and total number of major page faults
increase has been 4.89%/218,690% -> 4.39%/166,136% (200ms/s),
2.65%/111,886% -> 1.94%/59,053% (50ms/s), and 1.5%/34,973.40% ->
2.08%/8,781.75% (10ms/s).  The runtime under 10ms/s time quota has
increased with prioritization, but apparently that's under the margin of
error.

    time quota   prioritization  memory_saving  cpu_util  slowdown  pgmajfaults overhead
    N            N               47.21%         11.28%    4.59%     194,802%
    200ms/s      N               48.76%         11.24%    4.89%     218,690%
    50ms/s       N               37.83%         5.51%     2.65%     111,886%
    10ms/s       N               7.85%          2.01%     1.5%      34,793.40%
    200ms/s      Y               50.08%         10.38%    4.39%     166,136%
    50ms/s       Y               38.58%         4.97%     1.94%     59,053%
    10ms/s       Y               3.63%          1.73%     2.08%     8,781.75%

Baseline and Complete Git Trees
===============================

The patches are based on the latest -mm tree
(v5.15-rc5-mmots-2021-10-13-19-55).  You can also clone the complete git tree
from:

    $ git clone git://github.com/sjp38/linux -b damon_reclaim/patches/v1

The web is also available:
https://git.kernel.org/pub/scm/linux/kernel/git/sj/linux.git/tag/?h=damon_reclaim/patches/v1

Sequence Of Patches
===================

The first patch makes DAMOS support the physical address space for the
page out action.  Following five patches (patches 2-6) implement the
time/size quotas.  Next four patches (patches 7-10) implement the memory
regions prioritization within the limit.  Then, three following patches
(patches 11-13) implement the watermarks-based schemes activation.

Finally, the last two patches (patches 14-15) implement and document the
DAMON-based reclamation using the advanced DAMOS.

[1] https://www.kernel.org/doc/html/v5.15-rc1/vm/damon/index.html
[2] https://research.google/pubs/pub48551/
[3] https://lwn.net/Articles/787611/
[4] https://damonitor.github.io
[5] https://damonitor.github.io/doc/html/latest/vm/damon/eval.html
[6] https://lore.kernel.org/linux-mm/20211001125604.29660-1-sj@kernel.org/
[7] https://github.com/awslabs/damoos
[8] https://www.kernel.org/doc/html/latest/vm/free_page_reporting.html
[9] https://www.usenix.org/conference/fast-04/car-clock-adaptive-replacement

This patch (of 15):

This makes the DAMON primitives for physical address space support the
pageout action for DAMON-based Operation Schemes.  With this commit,
hence, users can easily implement system-level data access-aware
reclamations using DAMOS.

[sj@kernel.org: fix missing-prototype build warning]
  Link: https://lkml.kernel.org/r/20211025064220.13904-1-sj@kernel.org

Link: https://lkml.kernel.org/r/20211019150731.16699-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20211019150731.16699-2-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Amit Shah <amit@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Woodhouse <dwmw@amazon.com>
Cc: Marco Elver <elver@google.com>
Cc: Leonard Foerster <foersleo@amazon.de>
Cc: Greg Thelen <gthelen@google.com>
Cc: Markus Boehme <markubo@amazon.de>
Cc: David Rientjes <rientjes@google.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/damon.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 715dadd21f7c..9a327bc787b5 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -357,6 +357,8 @@ void damon_va_set_primitives(struct damon_ctx *ctx);
 void damon_pa_prepare_access_checks(struct damon_ctx *ctx);
 unsigned int damon_pa_check_accesses(struct damon_ctx *ctx);
 bool damon_pa_target_valid(void *t);
+int damon_pa_apply_scheme(struct damon_ctx *context, struct damon_target *t,
+		struct damon_region *r, struct damos *scheme);
 void damon_pa_set_primitives(struct damon_ctx *ctx);
 
 #endif	/* CONFIG_DAMON_PADDR */
-- 
cgit v1.2.3


From 2b8a248d5873343aa16f6c5ede30517693995f13 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 5 Nov 2021 13:47:16 -0700
Subject: mm/damon/schemes: implement size quota for schemes application speed
 control

There could be arbitrarily large memory regions fulfilling the target
data access pattern of a DAMON-based operation scheme.  In the case,
applying the action of the scheme could incur too high overhead.  To
provide an intuitive way for avoiding it, this implements a feature
called size quota.  If the quota is set, DAMON tries to apply the action
only up to the given amount of memory regions within a given time
window.

Link: https://lkml.kernel.org/r/20211019150731.16699-3-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Amit Shah <amit@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: David Woodhouse <dwmw@amazon.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Leonard Foerster <foersleo@amazon.de>
Cc: Marco Elver <elver@google.com>
Cc: Markus Boehme <markubo@amazon.de>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/damon.h | 36 ++++++++++++++++++++++++++++++++----
 1 file changed, 32 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 9a327bc787b5..3a1ce9d9921c 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -89,6 +89,26 @@ enum damos_action {
 	DAMOS_STAT,		/* Do nothing but only record the stat */
 };
 
+/**
+ * struct damos_quota - Controls the aggressiveness of the given scheme.
+ * @sz:			Maximum bytes of memory that the action can be applied.
+ * @reset_interval:	Charge reset interval in milliseconds.
+ *
+ * To avoid consuming too much CPU time or IO resources for applying the
+ * &struct damos->action to large memory, DAMON allows users to set a size
+ * quota.  The quota can be set by writing non-zero values to &sz.  If the size
+ * quota is set, DAMON tries to apply the action only up to &sz bytes within
+ * &reset_interval.
+ */
+struct damos_quota {
+	unsigned long sz;
+	unsigned long reset_interval;
+
+/* private: For charging the quota */
+	unsigned long charged_sz;
+	unsigned long charged_from;
+};
+
 /**
  * struct damos - Represents a Data Access Monitoring-based Operation Scheme.
  * @min_sz_region:	Minimum size of target regions.
@@ -98,13 +118,20 @@ enum damos_action {
  * @min_age_region:	Minimum age of target regions.
  * @max_age_region:	Maximum age of target regions.
  * @action:		&damo_action to be applied to the target regions.
+ * @quota:		Control the aggressiveness of this scheme.
  * @stat_count:		Total number of regions that this scheme is applied.
  * @stat_sz:		Total size of regions that this scheme is applied.
  * @list:		List head for siblings.
  *
- * For each aggregation interval, DAMON applies @action to monitoring target
- * regions fit in the condition and updates the statistics.  Note that both
- * the minimums and the maximums are inclusive.
+ * For each aggregation interval, DAMON finds regions which fit in the
+ * condition (&min_sz_region, &max_sz_region, &min_nr_accesses,
+ * &max_nr_accesses, &min_age_region, &max_age_region) and applies &action to
+ * those.  To avoid consuming too much CPU time or IO resources for the
+ * &action, &quota is used.
+ *
+ * After applying the &action to each region, &stat_count and &stat_sz is
+ * updated to reflect the number of regions and total size of regions that the
+ * &action is applied.
  */
 struct damos {
 	unsigned long min_sz_region;
@@ -114,6 +141,7 @@ struct damos {
 	unsigned int min_age_region;
 	unsigned int max_age_region;
 	enum damos_action action;
+	struct damos_quota quota;
 	unsigned long stat_count;
 	unsigned long stat_sz;
 	struct list_head list;
@@ -310,7 +338,7 @@ struct damos *damon_new_scheme(
 		unsigned long min_sz_region, unsigned long max_sz_region,
 		unsigned int min_nr_accesses, unsigned int max_nr_accesses,
 		unsigned int min_age_region, unsigned int max_age_region,
-		enum damos_action action);
+		enum damos_action action, struct damos_quota *quota);
 void damon_add_scheme(struct damon_ctx *ctx, struct damos *s);
 void damon_destroy_scheme(struct damos *s);
 
-- 
cgit v1.2.3


From 50585192bc2ef9309d32dabdbb5e735679f4f128 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 5 Nov 2021 13:47:20 -0700
Subject: mm/damon/schemes: skip already charged targets and regions

If DAMOS has stopped applying action in the middle of a group of memory
regions due to its size quota, it starts the work again from the
beginning of the address space in the next charge window.  If there is a
huge memory region at the beginning of the address space and it fulfills
the scheme's target data access pattern always, the action will applied
to only the region.

This mitigates the case by skipping memory regions that charged in
current charge window at the beginning of next charge window.

Link: https://lkml.kernel.org/r/20211019150731.16699-4-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Amit Shah <amit@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: David Woodhouse <dwmw@amazon.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Leonard Foerster <foersleo@amazon.de>
Cc: Marco Elver <elver@google.com>
Cc: Markus Boehme <markubo@amazon.de>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/damon.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 3a1ce9d9921c..585d985768fd 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -107,6 +107,8 @@ struct damos_quota {
 /* private: For charging the quota */
 	unsigned long charged_sz;
 	unsigned long charged_from;
+	struct damon_target *charge_target_from;
+	unsigned long charge_addr_from;
 };
 
 /**
@@ -307,6 +309,9 @@ struct damon_ctx {
 #define damon_prev_region(r) \
 	(container_of(r->list.prev, struct damon_region, list))
 
+#define damon_last_region(t) \
+	(list_last_entry(&t->regions_list, struct damon_region, list))
+
 #define damon_for_each_region(r, t) \
 	list_for_each_entry(r, &t->regions_list, list)
 
-- 
cgit v1.2.3


From 1cd2430300594a230dba9178ac9e286d868d9da2 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 5 Nov 2021 13:47:23 -0700
Subject: mm/damon/schemes: implement time quota

The size quota feature of DAMOS is useful for IO resource-critical
systems, but not so intuitive for CPU time-critical systems.  Systems
using zram or zswap-like swap device would be examples.

To provide another intuitive ways for such systems, this implements
time-based quota for DAMON-based Operation Schemes.  If the quota is
set, DAMOS tries to use only up to the user-defined quota of CPU time
within a given time window.

Link: https://lkml.kernel.org/r/20211019150731.16699-5-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Amit Shah <amit@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: David Woodhouse <dwmw@amazon.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Leonard Foerster <foersleo@amazon.de>
Cc: Marco Elver <elver@google.com>
Cc: Markus Boehme <markubo@amazon.de>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/damon.h | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 585d985768fd..1e7671bf3d23 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -91,20 +91,35 @@ enum damos_action {
 
 /**
  * struct damos_quota - Controls the aggressiveness of the given scheme.
+ * @ms:			Maximum milliseconds that the scheme can use.
  * @sz:			Maximum bytes of memory that the action can be applied.
  * @reset_interval:	Charge reset interval in milliseconds.
  *
  * To avoid consuming too much CPU time or IO resources for applying the
- * &struct damos->action to large memory, DAMON allows users to set a size
- * quota.  The quota can be set by writing non-zero values to &sz.  If the size
- * quota is set, DAMON tries to apply the action only up to &sz bytes within
- * &reset_interval.
+ * &struct damos->action to large memory, DAMON allows users to set time and/or
+ * size quotas.  The quotas can be set by writing non-zero values to &ms and
+ * &sz, respectively.  If the time quota is set, DAMON tries to use only up to
+ * &ms milliseconds within &reset_interval for applying the action.  If the
+ * size quota is set, DAMON tries to apply the action only up to &sz bytes
+ * within &reset_interval.
+ *
+ * Internally, the time quota is transformed to a size quota using estimated
+ * throughput of the scheme's action.  DAMON then compares it against &sz and
+ * uses smaller one as the effective quota.
  */
 struct damos_quota {
+	unsigned long ms;
 	unsigned long sz;
 	unsigned long reset_interval;
 
-/* private: For charging the quota */
+/* private: */
+	/* For throughput estimation */
+	unsigned long total_charged_sz;
+	unsigned long total_charged_ns;
+
+	unsigned long esz;	/* Effective size quota in bytes */
+
+	/* For charging the quota */
 	unsigned long charged_sz;
 	unsigned long charged_from;
 	struct damon_target *charge_target_from;
-- 
cgit v1.2.3


From 38683e003153f7abfa612d7b7fe147efa4624af2 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 5 Nov 2021 13:47:33 -0700
Subject: mm/damon/schemes: prioritize regions within the quotas

This makes DAMON apply schemes to regions having higher priority first,
if it cannot apply schemes to all regions due to the quotas.

The prioritization function should be implemented in the monitoring
primitives.  Those would commonly calculate the priority of the region
using attributes of regions, namely 'size', 'nr_accesses', and 'age'.
For example, some primitive would calculate the priority of each region
using a weighted sum of 'nr_accesses' and 'age' of the region.

The optimal weights would depend on give environments, so this makes
those customizable.  Nevertheless, the score calculation functions are
only encouraged to respect the weights, not mandated.

Link: https://lkml.kernel.org/r/20211019150731.16699-8-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Amit Shah <amit@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: David Woodhouse <dwmw@amazon.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Leonard Foerster <foersleo@amazon.de>
Cc: Marco Elver <elver@google.com>
Cc: Markus Boehme <markubo@amazon.de>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/damon.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 1e7671bf3d23..5d47ad9e3911 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -14,6 +14,8 @@
 
 /* Minimal region size.  Every damon_region is aligned by this. */
 #define DAMON_MIN_REGION	PAGE_SIZE
+/* Max priority score for DAMON-based operation schemes */
+#define DAMOS_MAX_SCORE		(99)
 
 /**
  * struct damon_addr_range - Represents an address region of [@start, @end).
@@ -95,6 +97,10 @@ enum damos_action {
  * @sz:			Maximum bytes of memory that the action can be applied.
  * @reset_interval:	Charge reset interval in milliseconds.
  *
+ * @weight_sz:		Weight of the region's size for prioritization.
+ * @weight_nr_accesses:	Weight of the region's nr_accesses for prioritization.
+ * @weight_age:		Weight of the region's age for prioritization.
+ *
  * To avoid consuming too much CPU time or IO resources for applying the
  * &struct damos->action to large memory, DAMON allows users to set time and/or
  * size quotas.  The quotas can be set by writing non-zero values to &ms and
@@ -106,12 +112,22 @@ enum damos_action {
  * Internally, the time quota is transformed to a size quota using estimated
  * throughput of the scheme's action.  DAMON then compares it against &sz and
  * uses smaller one as the effective quota.
+ *
+ * For selecting regions within the quota, DAMON prioritizes current scheme's
+ * target memory regions using the &struct damon_primitive->get_scheme_score.
+ * You could customize the prioritization logic by setting &weight_sz,
+ * &weight_nr_accesses, and &weight_age, because monitoring primitives are
+ * encouraged to respect those.
  */
 struct damos_quota {
 	unsigned long ms;
 	unsigned long sz;
 	unsigned long reset_interval;
 
+	unsigned int weight_sz;
+	unsigned int weight_nr_accesses;
+	unsigned int weight_age;
+
 /* private: */
 	/* For throughput estimation */
 	unsigned long total_charged_sz;
@@ -124,6 +140,10 @@ struct damos_quota {
 	unsigned long charged_from;
 	struct damon_target *charge_target_from;
 	unsigned long charge_addr_from;
+
+	/* For prioritization */
+	unsigned long histogram[DAMOS_MAX_SCORE + 1];
+	unsigned int min_score;
 };
 
 /**
@@ -174,6 +194,7 @@ struct damon_ctx;
  * @prepare_access_checks:	Prepare next access check of target regions.
  * @check_accesses:		Check the accesses to target regions.
  * @reset_aggregated:		Reset aggregated accesses monitoring results.
+ * @get_scheme_score:		Get the score of a region for a scheme.
  * @apply_scheme:		Apply a DAMON-based operation scheme.
  * @target_valid:		Determine if the target is valid.
  * @cleanup:			Clean up the context.
@@ -200,6 +221,8 @@ struct damon_ctx;
  * of its update.  The value will be used for regions adjustment threshold.
  * @reset_aggregated should reset the access monitoring results that aggregated
  * by @check_accesses.
+ * @get_scheme_score should return the priority score of a region for a scheme
+ * as an integer in [0, &DAMOS_MAX_SCORE].
  * @apply_scheme is called from @kdamond when a region for user provided
  * DAMON-based operation scheme is found.  It should apply the scheme's action
  * to the region.  This is not used for &DAMON_ARBITRARY_TARGET case.
@@ -213,6 +236,9 @@ struct damon_primitive {
 	void (*prepare_access_checks)(struct damon_ctx *context);
 	unsigned int (*check_accesses)(struct damon_ctx *context);
 	void (*reset_aggregated)(struct damon_ctx *context);
+	int (*get_scheme_score)(struct damon_ctx *context,
+			struct damon_target *t, struct damon_region *r,
+			struct damos *scheme);
 	int (*apply_scheme)(struct damon_ctx *context, struct damon_target *t,
 			struct damon_region *r, struct damos *scheme);
 	bool (*target_valid)(void *target);
-- 
cgit v1.2.3


From 198f0f4c58b9f481e4e51c8c70a6ab9852bbab7f Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 5 Nov 2021 13:47:37 -0700
Subject: mm/damon/vaddr,paddr: support pageout prioritization

This makes the default monitoring primitives for virtual address spaces
and the physical address sapce to support memory regions prioritization
for 'PAGEOUT' DAMOS action.  It calculates hotness of each region as
weighted sum of 'nr_accesses' and 'age' of the region and get the
priority score as reverse of the hotness, so that cold regions can be
paged out first.

Link: https://lkml.kernel.org/r/20211019150731.16699-9-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Amit Shah <amit@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: David Woodhouse <dwmw@amazon.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Leonard Foerster <foersleo@amazon.de>
Cc: Marco Elver <elver@google.com>
Cc: Markus Boehme <markubo@amazon.de>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/damon.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 5d47ad9e3911..1217566a0ebc 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -421,6 +421,8 @@ bool damon_va_target_valid(void *t);
 void damon_va_cleanup(struct damon_ctx *ctx);
 int damon_va_apply_scheme(struct damon_ctx *context, struct damon_target *t,
 		struct damon_region *r, struct damos *scheme);
+int damon_va_scheme_score(struct damon_ctx *context, struct damon_target *t,
+		struct damon_region *r, struct damos *scheme);
 void damon_va_set_primitives(struct damon_ctx *ctx);
 
 #endif	/* CONFIG_DAMON_VADDR */
@@ -433,6 +435,8 @@ unsigned int damon_pa_check_accesses(struct damon_ctx *ctx);
 bool damon_pa_target_valid(void *t);
 int damon_pa_apply_scheme(struct damon_ctx *context, struct damon_target *t,
 		struct damon_region *r, struct damos *scheme);
+int damon_pa_scheme_score(struct damon_ctx *context, struct damon_target *t,
+		struct damon_region *r, struct damos *scheme);
 void damon_pa_set_primitives(struct damon_ctx *ctx);
 
 #endif	/* CONFIG_DAMON_PADDR */
-- 
cgit v1.2.3


From ee801b7dd7822a82fd7663048ad649545fac6df3 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 5 Nov 2021 13:47:47 -0700
Subject: mm/damon/schemes: activate schemes based on a watermarks mechanism

DAMON-based operation schemes need to be manually turned on and off.  In
some use cases, however, the condition for turning a scheme on and off
would depend on the system's situation.  For example, schemes for
proactive pages reclamation would need to be turned on when some memory
pressure is detected, and turned off when the system has enough free
memory.

For easier control of schemes activation based on the system situation,
this introduces a watermarks-based mechanism.  The client can describe
the watermark metric (e.g., amount of free memory in the system),
watermark check interval, and three watermarks, namely high, mid, and
low.  If the scheme is deactivated, it only gets the metric and compare
that to the three watermarks for every check interval.  If the metric is
higher than the high watermark, the scheme is deactivated.  If the
metric is between the mid watermark and the low watermark, the scheme is
activated.  If the metric is lower than the low watermark, the scheme is
deactivated again.  This is to allow users fall back to traditional
page-granularity mechanisms.

Link: https://lkml.kernel.org/r/20211019150731.16699-12-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Amit Shah <amit@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: David Woodhouse <dwmw@amazon.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Leonard Foerster <foersleo@amazon.de>
Cc: Marco Elver <elver@google.com>
Cc: Markus Boehme <markubo@amazon.de>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/damon.h | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 51 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 1217566a0ebc..c93325efddd7 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -146,6 +146,45 @@ struct damos_quota {
 	unsigned int min_score;
 };
 
+/**
+ * enum damos_wmark_metric - Represents the watermark metric.
+ *
+ * @DAMOS_WMARK_NONE:		Ignore the watermarks of the given scheme.
+ * @DAMOS_WMARK_FREE_MEM_RATE:	Free memory rate of the system in [0,1000].
+ */
+enum damos_wmark_metric {
+	DAMOS_WMARK_NONE,
+	DAMOS_WMARK_FREE_MEM_RATE,
+};
+
+/**
+ * struct damos_watermarks - Controls when a given scheme should be activated.
+ * @metric:	Metric for the watermarks.
+ * @interval:	Watermarks check time interval in microseconds.
+ * @high:	High watermark.
+ * @mid:	Middle watermark.
+ * @low:	Low watermark.
+ *
+ * If &metric is &DAMOS_WMARK_NONE, the scheme is always active.  Being active
+ * means DAMON does monitoring and applying the action of the scheme to
+ * appropriate memory regions.  Else, DAMON checks &metric of the system for at
+ * least every &interval microseconds and works as below.
+ *
+ * If &metric is higher than &high, the scheme is inactivated.  If &metric is
+ * between &mid and &low, the scheme is activated.  If &metric is lower than
+ * &low, the scheme is inactivated.
+ */
+struct damos_watermarks {
+	enum damos_wmark_metric metric;
+	unsigned long interval;
+	unsigned long high;
+	unsigned long mid;
+	unsigned long low;
+
+/* private: */
+	bool activated;
+};
+
 /**
  * struct damos - Represents a Data Access Monitoring-based Operation Scheme.
  * @min_sz_region:	Minimum size of target regions.
@@ -156,6 +195,7 @@ struct damos_quota {
  * @max_age_region:	Maximum age of target regions.
  * @action:		&damo_action to be applied to the target regions.
  * @quota:		Control the aggressiveness of this scheme.
+ * @wmarks:		Watermarks for automated (in)activation of this scheme.
  * @stat_count:		Total number of regions that this scheme is applied.
  * @stat_sz:		Total size of regions that this scheme is applied.
  * @list:		List head for siblings.
@@ -166,6 +206,14 @@ struct damos_quota {
  * those.  To avoid consuming too much CPU time or IO resources for the
  * &action, &quota is used.
  *
+ * To do the work only when needed, schemes can be activated for specific
+ * system situations using &wmarks.  If all schemes that registered to the
+ * monitoring context are inactive, DAMON stops monitoring either, and just
+ * repeatedly checks the watermarks.
+ *
+ * If all schemes that registered to a &struct damon_ctx are inactive, DAMON
+ * stops monitoring and just repeatedly checks the watermarks.
+ *
  * After applying the &action to each region, &stat_count and &stat_sz is
  * updated to reflect the number of regions and total size of regions that the
  * &action is applied.
@@ -179,6 +227,7 @@ struct damos {
 	unsigned int max_age_region;
 	enum damos_action action;
 	struct damos_quota quota;
+	struct damos_watermarks wmarks;
 	unsigned long stat_count;
 	unsigned long stat_sz;
 	struct list_head list;
@@ -384,7 +433,8 @@ struct damos *damon_new_scheme(
 		unsigned long min_sz_region, unsigned long max_sz_region,
 		unsigned int min_nr_accesses, unsigned int max_nr_accesses,
 		unsigned int min_age_region, unsigned int max_age_region,
-		enum damos_action action, struct damos_quota *quota);
+		enum damos_action action, struct damos_quota *quota,
+		struct damos_watermarks *wmarks);
 void damon_add_scheme(struct damon_ctx *ctx, struct damos *s);
 void damon_destroy_scheme(struct damos *s);
 
-- 
cgit v1.2.3


From b5ca3e83ddb05342b1b30700b999cb9b107511f6 Mon Sep 17 00:00:00 2001
From: Xin Hao <xhao@linux.alibaba.com>
Date: Fri, 5 Nov 2021 13:48:07 -0700
Subject: mm/damon/dbgfs: add adaptive_targets list check before enable
 monitor_on

When the ctx->adaptive_targets list is empty, I did some test on
monitor_on interface like this.

    # cat /sys/kernel/debug/damon/target_ids
    #
    # echo on > /sys/kernel/debug/damon/monitor_on
    # damon: kdamond (5390) starts

Though the ctx->adaptive_targets list is empty, but the kthread_run
still be called, and the kdamond.x thread still be created, this is
meaningless.

So there adds a judgment in 'dbgfs_monitor_on_write', if the
ctx->adaptive_targets list is empty, return -EINVAL.

Link: https://lkml.kernel.org/r/0a60a6e8ec9d71989e0848a4dc3311996ca3b5d4.1634720326.git.xhao@linux.alibaba.com
Signed-off-by: Xin Hao <xhao@linux.alibaba.com>
Reviewed-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/damon.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index c93325efddd7..fa7f32614b65 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -440,6 +440,7 @@ void damon_destroy_scheme(struct damos *s);
 
 struct damon_target *damon_new_target(unsigned long id);
 void damon_add_target(struct damon_ctx *ctx, struct damon_target *t);
+bool damon_targets_empty(struct damon_ctx *ctx);
 void damon_free_target(struct damon_target *t);
 void damon_destroy_target(struct damon_target *t);
 unsigned int damon_nr_regions(struct damon_target *t);
-- 
cgit v1.2.3


From 0f91d13366a402420bf98eaaf393db03946c13e0 Mon Sep 17 00:00:00 2001
From: Changbin Du <changbin.du@gmail.com>
Date: Fri, 5 Nov 2021 13:48:22 -0700
Subject: mm/damon: simplify stop mechanism

A kernel thread can exit gracefully with kthread_stop().  So we don't
need a new flag 'kdamond_stop'.  And to make sure the task struct is not
freed when accessing it, get reference to it before termination.

Link: https://lkml.kernel.org/r/20211027130517.4404-1-changbin.du@gmail.com
Signed-off-by: Changbin Du <changbin.du@gmail.com>
Reviewed-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/damon.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index fa7f32614b65..321de9d72360 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -381,7 +381,6 @@ struct damon_ctx {
 
 /* public: */
 	struct task_struct *kdamond;
-	bool kdamond_stop;
 	struct mutex kdamond_lock;
 
 	struct damon_primitive primitive;
-- 
cgit v1.2.3


From 658f9ae761b5965893727dd4edcdad56e5a439bb Mon Sep 17 00:00:00 2001
From: Changbin Du <changbin.du@gmail.com>
Date: Fri, 5 Nov 2021 13:48:27 -0700
Subject: mm/damon: remove return value from before_terminate callback

Since the return value of 'before_terminate' callback is never used, we
make it have no return value.

Link: https://lkml.kernel.org/r/20211029005023.8895-1-changbin.du@gmail.com
Signed-off-by: Changbin Du <changbin.du@gmail.com>
Reviewed-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/damon.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 321de9d72360..b4d4be3cc987 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -322,7 +322,7 @@ struct damon_callback {
 	int (*before_start)(struct damon_ctx *context);
 	int (*after_sampling)(struct damon_ctx *context);
 	int (*after_aggregation)(struct damon_ctx *context);
-	int (*before_terminate)(struct damon_ctx *context);
+	void (*before_terminate)(struct damon_ctx *context);
 };
 
 /**
-- 
cgit v1.2.3


From 7c7e3d31e7856a8260a254f8c71db416f7f9f5a1 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Fri, 5 Nov 2021 16:23:29 -0700
Subject: bpf: Introduce helper bpf_find_vma

In some profiler use cases, it is necessary to map an address to the
backing file, e.g., a shared library. bpf_find_vma helper provides a
flexible way to achieve this. bpf_find_vma maps an address of a task to
the vma (vm_area_struct) for this address, and feed the vma to an callback
BPF function. The callback function is necessary here, as we need to
ensure mmap_sem is unlocked.

It is necessary to lock mmap_sem for find_vma. To lock and unlock mmap_sem
safely when irqs are disable, we use the same mechanism as stackmap with
build_id. Specifically, when irqs are disabled, the unlocked is postponed
in an irq_work. Refactor stackmap.c so that the irq_work is shared among
bpf_find_vma and stackmap helpers.

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Tested-by: Hengqi Chen <hengqi.chen@gmail.com>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20211105232330.1936330-2-songliubraving@fb.com
---
 include/linux/bpf.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2be6dfd68df9..df3410bff4b0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2157,6 +2157,7 @@ extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto;
 extern const struct bpf_func_proto bpf_sk_setsockopt_proto;
 extern const struct bpf_func_proto bpf_sk_getsockopt_proto;
 extern const struct bpf_func_proto bpf_kallsyms_lookup_name_proto;
+extern const struct bpf_func_proto bpf_find_vma_proto;
 
 const struct bpf_func_proto *tracing_prog_func_proto(
   enum bpf_func_id func_id, const struct bpf_prog *prog);
-- 
cgit v1.2.3


From e1c9788cb39777e81ebfbf31ae80b4ec14eb6f6d Mon Sep 17 00:00:00 2001
From: Kotresh HR <khiremat@redhat.com>
Date: Mon, 27 Sep 2021 19:22:27 +0530
Subject: ceph: don't rely on error_string to validate blocklisted session.

The "error_string" in the metadata of MClientSession is being
parsed by kclient to validate whether the session is blocklisted.
The "error_string" is for humans and shouldn't be relied on it.
Hence added the flag to MClientsession to indicate the session
is blocklisted.

[ jlayton: minor formatting cleanup ]

URL: https://tracker.ceph.com/issues/47450
Signed-off-by: Kotresh HR <khiremat@redhat.com>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/ceph_fs.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index bc2699feddbe..7ad6c3d0db7d 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -302,6 +302,8 @@ enum {
 	CEPH_SESSION_REQUEST_FLUSH_MDLOG,
 };
 
+#define CEPH_SESSION_BLOCKLISTED	(1 << 0)  /* session blocklisted */
+
 extern const char *ceph_session_op_name(int op);
 
 struct ceph_mds_session_head {
-- 
cgit v1.2.3


From aca39d9e86f3edeaac5d2c467f5fd31e0b0df606 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lu=C3=ADs=20Henriques?= <lhenriques@suse.de>
Date: Thu, 4 Nov 2021 12:31:46 +0000
Subject: libceph, ceph: move ceph_osdc_copy_from() into cephfs code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch moves ceph_osdc_copy_from() function out of libceph code into
cephfs.  There are no other users for this function, and there is the need
(in another patch) to access internal ceph_osd_request struct members.

Signed-off-by: Luís Henriques <lhenriques@suse.de>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/osd_client.h | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 83fa08a06507..3431011f364d 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -475,6 +475,14 @@ extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
 				       u64 expected_object_size,
 				       u64 expected_write_size,
 				       u32 flags);
+extern int osd_req_op_copy_from_init(struct ceph_osd_request *req,
+				     u64 src_snapid, u64 src_version,
+				     struct ceph_object_id *src_oid,
+				     struct ceph_object_locator *src_oloc,
+				     u32 src_fadvise_flags,
+				     u32 dst_fadvise_flags,
+				     u32 truncate_seq, u64 truncate_size,
+				     u8 copy_from_flags);
 
 extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 					       struct ceph_snap_context *snapc,
@@ -515,17 +523,6 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
 		   struct page *req_page, size_t req_len,
 		   struct page **resp_pages, size_t *resp_len);
 
-int ceph_osdc_copy_from(struct ceph_osd_client *osdc,
-			u64 src_snapid, u64 src_version,
-			struct ceph_object_id *src_oid,
-			struct ceph_object_locator *src_oloc,
-			u32 src_fadvise_flags,
-			struct ceph_object_id *dst_oid,
-			struct ceph_object_locator *dst_oloc,
-			u32 dst_fadvise_flags,
-			u32 truncate_seq, u64 truncate_size,
-			u8 copy_from_flags);
-
 /* watch/notify */
 struct ceph_osd_linger_request *
 ceph_osdc_watch(struct ceph_osd_client *osdc,
-- 
cgit v1.2.3


From c6975d7cab5b903aadbc0f78f9af4fae1bd23a50 Mon Sep 17 00:00:00 2001
From: Qian Cai <quic_qiancai@quicinc.com>
Date: Fri, 5 Nov 2021 11:05:09 -0400
Subject: arm64: Track no early_pgtable_alloc() for kmemleak

After switched page size from 64KB to 4KB on several arm64 servers here,
kmemleak starts to run out of early memory pool due to a huge number of
those early_pgtable_alloc() calls:

  kmemleak_alloc_phys()
  memblock_alloc_range_nid()
  memblock_phys_alloc_range()
  early_pgtable_alloc()
  init_pmd()
  alloc_init_pud()
  __create_pgd_mapping()
  __map_memblock()
  paging_init()
  setup_arch()
  start_kernel()

Increased the default value of DEBUG_KMEMLEAK_MEM_POOL_SIZE by 4 times
won't be enough for a server with 200GB+ memory. There isn't much
interesting to check memory leaks for those early page tables and those
early memory mappings should not reference to other memory. Hence, no
kmemleak false positives, and we can safely skip tracking those early
allocations from kmemleak like we did in the commit fed84c785270
("mm/memblock.c: skip kmemleak for kasan_init()") without needing to
introduce complications to automatically scale the value depends on the
runtime memory size etc. After the patch, the default value of
DEBUG_KMEMLEAK_MEM_POOL_SIZE becomes sufficient again.

Signed-off-by: Qian Cai <quic_qiancai@quicinc.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>
Link: https://lore.kernel.org/r/20211105150509.7826-1-quic_qiancai@quicinc.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/memblock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 34de69b3b8ba..efc896155dae 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -372,7 +372,7 @@ static inline int memblock_get_region_node(const struct memblock_region *r)
 /* Flags for memblock allocation APIs */
 #define MEMBLOCK_ALLOC_ANYWHERE	(~(phys_addr_t)0)
 #define MEMBLOCK_ALLOC_ACCESSIBLE	0
-#define MEMBLOCK_ALLOC_KASAN		1
+#define MEMBLOCK_ALLOC_NOLEAKTRACE	1
 
 /* We are using top down, so it is safe to use 0 here */
 #define MEMBLOCK_LOW_LIMIT 0
-- 
cgit v1.2.3


From dfd5bb23ad75bdabde89ac3166705a450bf16acb Mon Sep 17 00:00:00 2001
From: Niklas Schnelle <schnelle@linux.ibm.com>
Date: Wed, 5 May 2021 14:00:06 +0200
Subject: PCI: Export pci_dev_lock()

Commit e3a9b1212b9d ("PCI: Export pci_dev_trylock() and pci_dev_unlock()")
already exported pci_dev_trylock()/pci_dev_unlock() however in some
circumstances such as during error recovery it makes sense to block
waiting to get full access to the device so also export pci_dev_lock().

Link: https://lore.kernel.org/all/20210928181014.GA713179@bhelgaas/
Acked-by: Pierre Morel <pmorel@linux.ibm.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 include/linux/pci.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index b4dbcc86b3f1..d307b071b65e 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1664,6 +1664,7 @@ void pci_cfg_access_lock(struct pci_dev *dev);
 bool pci_cfg_access_trylock(struct pci_dev *dev);
 void pci_cfg_access_unlock(struct pci_dev *dev);
 
+void pci_dev_lock(struct pci_dev *dev);
 int pci_dev_trylock(struct pci_dev *dev);
 void pci_dev_unlock(struct pci_dev *dev);
 
-- 
cgit v1.2.3


From 40a34121ac1dc52ed9cd34a8f4e48e32517a52fd Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Wed, 3 Nov 2021 13:47:32 -0700
Subject: bpf, sockmap: Use stricter sk state checks in sk_lookup_assign

In order to fix an issue with sockets in TCP sockmap redirect cases we plan
to allow CLOSE state sockets to exist in the sockmap. However, the check in
bpf_sk_lookup_assign() currently only invalidates sockets in the
TCP_ESTABLISHED case relying on the checks on sockmap insert to ensure we
never SOCK_CLOSE state sockets in the map.

To prepare for this change we flip the logic in bpf_sk_lookup_assign() to
explicitly test for the accepted cases. Namely, a tcp socket in TCP_LISTEN
or a udp socket in TCP_CLOSE state. This also makes the code more resilent
to future changes.

Suggested-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/bpf/20211103204736.248403-2-john.fastabend@gmail.com
---
 include/linux/skmsg.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index b4256847c707..584d94be9c8b 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -507,6 +507,18 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock)
 	return !!psock->saved_data_ready;
 }
 
+static inline bool sk_is_tcp(const struct sock *sk)
+{
+	return sk->sk_type == SOCK_STREAM &&
+	       sk->sk_protocol == IPPROTO_TCP;
+}
+
+static inline bool sk_is_udp(const struct sock *sk)
+{
+	return sk->sk_type == SOCK_DGRAM &&
+	       sk->sk_protocol == IPPROTO_UDP;
+}
+
 #if IS_ENABLED(CONFIG_NET_SOCK_MSG)
 
 #define BPF_F_STRPARSER	(1UL << 1)
-- 
cgit v1.2.3


From cf30f6a5f0c60ec98a637b836bef6915f602c6ab Mon Sep 17 00:00:00 2001
From: Nick Terrell <terrelln@fb.com>
Date: Fri, 11 Sep 2020 16:49:00 -0700
Subject: lib: zstd: Add kernel-specific API

This patch:
- Moves `include/linux/zstd.h` -> `include/linux/zstd_lib.h`
- Updates modified zstd headers to yearless copyright
- Adds a new API in `include/linux/zstd.h` that is functionally
  equivalent to the in-use subset of the current API. Functions are
  renamed to avoid symbol collisions with zstd, to make it clear it is
  not the upstream zstd API, and to follow the kernel style guide.
- Updates all callers to use the new API.

There are no functional changes in this patch. Since there are no
functional change, I felt it was okay to update all the callers in a
single patch. Once the API is approved, the callers are mechanically
changed.

This patch is preparing for the 3rd patch in this series, which updates
zstd to version 1.4.10. Since the upstream zstd API is no longer exposed
to callers, the update can happen transparently.

Signed-off-by: Nick Terrell <terrelln@fb.com>
Tested By: Paul Jones <paul@pauljones.id.au>
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com> # LLVM/Clang v13.0.0 on x86-64
Tested-by: Jean-Denis Girard <jd.girard@sysnux.pf>
---
 include/linux/zstd.h     | 1243 ++++++++++------------------------------------
 include/linux/zstd_lib.h | 1157 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 1420 insertions(+), 980 deletions(-)
 create mode 100644 include/linux/zstd_lib.h

(limited to 'include/linux')

diff --git a/include/linux/zstd.h b/include/linux/zstd.h
index e87f78c9b19c..9fbc7729b0a0 100644
--- a/include/linux/zstd.h
+++ b/include/linux/zstd.h
@@ -1,138 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * Copyright (c) Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of https://github.com/facebook/zstd.
- * An additional grant of patent rights can be found in the PATENTS file in the
- * same directory.
- *
- * This program is free software; you can redistribute it and/or modify it under
- * the terms of the GNU General Public License version 2 as published by the
- * Free Software Foundation. This program is dual-licensed; you may select
- * either version 2 of the GNU General Public License ("GPL") or BSD license
- * ("BSD").
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of https://github.com/facebook/zstd) and
+ * the GPLv2 (found in the COPYING file in the root directory of
+ * https://github.com/facebook/zstd). You may select, at your option, one of the
+ * above-listed licenses.
  */
 
-#ifndef ZSTD_H
-#define ZSTD_H
+#ifndef LINUX_ZSTD_H
+#define LINUX_ZSTD_H
 
-/* ======   Dependency   ======*/
-#include <linux/types.h>   /* size_t */
+/**
+ * This is a kernel-style API that wraps the upstream zstd API, which cannot be
+ * used directly because the symbols aren't exported. It exposes the minimal
+ * functionality which is currently required by users of zstd in the kernel.
+ * Expose extra functions from lib/zstd/zstd.h as needed.
+ */
 
+/* ======   Dependency   ====== */
+#include <linux/types.h>
+#include <linux/zstd_lib.h>
 
-/*-*****************************************************************************
- * Introduction
+/* ======   Helper Functions   ====== */
+/**
+ * zstd_compress_bound() - maximum compressed size in worst case scenario
+ * @src_size: The size of the data to compress.
  *
- * zstd, short for Zstandard, is a fast lossless compression algorithm,
- * targeting real-time compression scenarios at zlib-level and better
- * compression ratios. The zstd compression library provides in-memory
- * compression and decompression functions. The library supports compression
- * levels from 1 up to ZSTD_maxCLevel() which is 22. Levels >= 20, labeled
- * ultra, should be used with caution, as they require more memory.
- * Compression can be done in:
- *  - a single step, reusing a context (described as Explicit memory management)
- *  - unbounded multiple steps (described as Streaming compression)
- * The compression ratio achievable on small data can be highly improved using
- * compression with a dictionary in:
- *  - a single step (described as Simple dictionary API)
- *  - a single step, reusing a dictionary (described as Fast dictionary API)
- ******************************************************************************/
-
-/*======  Helper functions  ======*/
+ * Return:    The maximum compressed size in the worst case scenario.
+ */
+size_t zstd_compress_bound(size_t src_size);
 
 /**
- * enum ZSTD_ErrorCode - zstd error codes
+ * zstd_is_error() - tells if a size_t function result is an error code
+ * @code:  The function result to check for error.
  *
- * Functions that return size_t can be checked for errors using ZSTD_isError()
- * and the ZSTD_ErrorCode can be extracted using ZSTD_getErrorCode().
+ * Return: Non-zero iff the code is an error.
+ */
+unsigned int zstd_is_error(size_t code);
+
+/**
+ * enum zstd_error_code - zstd error codes
  */
-typedef enum {
-	ZSTD_error_no_error,
-	ZSTD_error_GENERIC,
-	ZSTD_error_prefix_unknown,
-	ZSTD_error_version_unsupported,
-	ZSTD_error_parameter_unknown,
-	ZSTD_error_frameParameter_unsupported,
-	ZSTD_error_frameParameter_unsupportedBy32bits,
-	ZSTD_error_frameParameter_windowTooLarge,
-	ZSTD_error_compressionParameter_unsupported,
-	ZSTD_error_init_missing,
-	ZSTD_error_memory_allocation,
-	ZSTD_error_stage_wrong,
-	ZSTD_error_dstSize_tooSmall,
-	ZSTD_error_srcSize_wrong,
-	ZSTD_error_corruption_detected,
-	ZSTD_error_checksum_wrong,
-	ZSTD_error_tableLog_tooLarge,
-	ZSTD_error_maxSymbolValue_tooLarge,
-	ZSTD_error_maxSymbolValue_tooSmall,
-	ZSTD_error_dictionary_corrupted,
-	ZSTD_error_dictionary_wrong,
-	ZSTD_error_dictionaryCreation_failed,
-	ZSTD_error_maxCode
-} ZSTD_ErrorCode;
+typedef ZSTD_ErrorCode zstd_error_code;
 
 /**
- * ZSTD_maxCLevel() - maximum compression level available
+ * zstd_get_error_code() - translates an error function result to an error code
+ * @code:  The function result for which zstd_is_error(code) is true.
  *
- * Return: Maximum compression level available.
+ * Return: A unique error code for this error.
  */
-int ZSTD_maxCLevel(void);
+zstd_error_code zstd_get_error_code(size_t code);
+
 /**
- * ZSTD_compressBound() - maximum compressed size in worst case scenario
- * @srcSize: The size of the data to compress.
+ * zstd_get_error_name() - translates an error function result to a string
+ * @code:  The function result for which zstd_is_error(code) is true.
  *
- * Return:   The maximum compressed size in the worst case scenario.
+ * Return: An error string corresponding to the error code.
  */
-size_t ZSTD_compressBound(size_t srcSize);
+const char *zstd_get_error_name(size_t code);
+
 /**
- * ZSTD_isError() - tells if a size_t function result is an error code
- * @code:  The function result to check for error.
+ * zstd_min_clevel() - minimum allowed compression level
  *
- * Return: Non-zero iff the code is an error.
+ * Return: The minimum allowed compression level.
  */
-static __attribute__((unused)) unsigned int ZSTD_isError(size_t code)
-{
-	return code > (size_t)-ZSTD_error_maxCode;
-}
+int zstd_min_clevel(void);
+
 /**
- * ZSTD_getErrorCode() - translates an error function result to a ZSTD_ErrorCode
- * @functionResult: The result of a function for which ZSTD_isError() is true.
+ * zstd_max_clevel() - maximum allowed compression level
  *
- * Return:          The ZSTD_ErrorCode corresponding to the functionResult or 0
- *                  if the functionResult isn't an error.
+ * Return: The maximum allowed compression level.
  */
-static __attribute__((unused)) ZSTD_ErrorCode ZSTD_getErrorCode(
-	size_t functionResult)
-{
-	if (!ZSTD_isError(functionResult))
-		return (ZSTD_ErrorCode)0;
-	return (ZSTD_ErrorCode)(0 - functionResult);
-}
+int zstd_max_clevel(void);
+
+/* ======   Parameter Selection   ====== */
 
 /**
- * enum ZSTD_strategy - zstd compression search strategy
+ * enum zstd_strategy - zstd compression search strategy
  *
- * From faster to stronger.
+ * From faster to stronger. See zstd_lib.h.
  */
-typedef enum {
-	ZSTD_fast,
-	ZSTD_dfast,
-	ZSTD_greedy,
-	ZSTD_lazy,
-	ZSTD_lazy2,
-	ZSTD_btlazy2,
-	ZSTD_btopt,
-	ZSTD_btopt2
-} ZSTD_strategy;
+typedef ZSTD_strategy zstd_strategy;
 
 /**
- * struct ZSTD_compressionParameters - zstd compression parameters
+ * struct zstd_compression_parameters - zstd compression parameters
  * @windowLog:    Log of the largest match distance. Larger means more
  *                compression, and more memory needed during decompression.
- * @chainLog:     Fully searched segment. Larger means more compression, slower,
- *                and more memory (useless for fast).
+ * @chainLog:     Fully searched segment. Larger means more compression,
+ *                slower, and more memory (useless for fast).
  * @hashLog:      Dispatch table. Larger means more compression,
  *                slower, and more memory.
  * @searchLog:    Number of searches. Larger means more compression and slower.
@@ -141,1017 +99,342 @@ typedef enum {
  * @targetLength: Acceptable match size for optimal parser (only). Larger means
  *                more compression, and slower.
  * @strategy:     The zstd compression strategy.
+ *
+ * See zstd_lib.h.
  */
-typedef struct {
-	unsigned int windowLog;
-	unsigned int chainLog;
-	unsigned int hashLog;
-	unsigned int searchLog;
-	unsigned int searchLength;
-	unsigned int targetLength;
-	ZSTD_strategy strategy;
-} ZSTD_compressionParameters;
+typedef ZSTD_compressionParameters zstd_compression_parameters;
 
 /**
- * struct ZSTD_frameParameters - zstd frame parameters
- * @contentSizeFlag: Controls whether content size will be present in the frame
- *                   header (when known).
- * @checksumFlag:    Controls whether a 32-bit checksum is generated at the end
- *                   of the frame for error detection.
- * @noDictIDFlag:    Controls whether dictID will be saved into the frame header
- *                   when using dictionary compression.
+ * struct zstd_frame_parameters - zstd frame parameters
+ * @contentSizeFlag: Controls whether content size will be present in the
+ *                   frame header (when known).
+ * @checksumFlag:    Controls whether a 32-bit checksum is generated at the
+ *                   end of the frame for error detection.
+ * @noDictIDFlag:    Controls whether dictID will be saved into the frame
+ *                   header when using dictionary compression.
  *
- * The default value is all fields set to 0.
+ * The default value is all fields set to 0. See zstd_lib.h.
  */
-typedef struct {
-	unsigned int contentSizeFlag;
-	unsigned int checksumFlag;
-	unsigned int noDictIDFlag;
-} ZSTD_frameParameters;
+typedef ZSTD_frameParameters zstd_frame_parameters;
 
 /**
- * struct ZSTD_parameters - zstd parameters
+ * struct zstd_parameters - zstd parameters
  * @cParams: The compression parameters.
  * @fParams: The frame parameters.
  */
-typedef struct {
-	ZSTD_compressionParameters cParams;
-	ZSTD_frameParameters fParams;
-} ZSTD_parameters;
+typedef ZSTD_parameters zstd_parameters;
 
 /**
- * ZSTD_getCParams() - returns ZSTD_compressionParameters for selected level
- * @compressionLevel: The compression level from 1 to ZSTD_maxCLevel().
- * @estimatedSrcSize: The estimated source size to compress or 0 if unknown.
- * @dictSize:         The dictionary size or 0 if a dictionary isn't being used.
+ * zstd_get_params() - returns zstd_parameters for selected level
+ * @level:              The compression level
+ * @estimated_src_size: The estimated source size to compress or 0
+ *                      if unknown.
  *
- * Return:            The selected ZSTD_compressionParameters.
+ * Return:              The selected zstd_parameters.
  */
-ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel,
-	unsigned long long estimatedSrcSize, size_t dictSize);
+zstd_parameters zstd_get_params(int level,
+	unsigned long long estimated_src_size);
 
-/**
- * ZSTD_getParams() - returns ZSTD_parameters for selected level
- * @compressionLevel: The compression level from 1 to ZSTD_maxCLevel().
- * @estimatedSrcSize: The estimated source size to compress or 0 if unknown.
- * @dictSize:         The dictionary size or 0 if a dictionary isn't being used.
- *
- * The same as ZSTD_getCParams() except also selects the default frame
- * parameters (all zero).
- *
- * Return:            The selected ZSTD_parameters.
- */
-ZSTD_parameters ZSTD_getParams(int compressionLevel,
-	unsigned long long estimatedSrcSize, size_t dictSize);
+/* ======   Single-pass Compression   ====== */
 
-/*-*************************************
- * Explicit memory management
- **************************************/
+typedef ZSTD_CCtx zstd_cctx;
 
 /**
- * ZSTD_CCtxWorkspaceBound() - amount of memory needed to initialize a ZSTD_CCtx
- * @cParams: The compression parameters to be used for compression.
+ * zstd_cctx_workspace_bound() - max memory needed to initialize a zstd_cctx
+ * @parameters: The compression parameters to be used.
  *
  * If multiple compression parameters might be used, the caller must call
- * ZSTD_CCtxWorkspaceBound() for each set of parameters and use the maximum
+ * zstd_cctx_workspace_bound() for each set of parameters and use the maximum
  * size.
  *
- * Return:   A lower bound on the size of the workspace that is passed to
- *           ZSTD_initCCtx().
+ * Return:      A lower bound on the size of the workspace that is passed to
+ *              zstd_init_cctx().
  */
-size_t ZSTD_CCtxWorkspaceBound(ZSTD_compressionParameters cParams);
+size_t zstd_cctx_workspace_bound(const zstd_compression_parameters *parameters);
 
 /**
- * struct ZSTD_CCtx - the zstd compression context
- *
- * When compressing many times it is recommended to allocate a context just once
- * and reuse it for each successive compression operation.
- */
-typedef struct ZSTD_CCtx_s ZSTD_CCtx;
-/**
- * ZSTD_initCCtx() - initialize a zstd compression context
- * @workspace:     The workspace to emplace the context into. It must outlive
- *                 the returned context.
- * @workspaceSize: The size of workspace. Use ZSTD_CCtxWorkspaceBound() to
- *                 determine how large the workspace must be.
- *
- * Return:         A compression context emplaced into workspace.
- */
-ZSTD_CCtx *ZSTD_initCCtx(void *workspace, size_t workspaceSize);
-
-/**
- * ZSTD_compressCCtx() - compress src into dst
- * @ctx:         The context. Must have been initialized with a workspace at
- *               least as large as ZSTD_CCtxWorkspaceBound(params.cParams).
- * @dst:         The buffer to compress src into.
- * @dstCapacity: The size of the destination buffer. May be any size, but
- *               ZSTD_compressBound(srcSize) is guaranteed to be large enough.
- * @src:         The data to compress.
- * @srcSize:     The size of the data to compress.
- * @params:      The parameters to use for compression. See ZSTD_getParams().
- *
- * Return:       The compressed size or an error, which can be checked using
- *               ZSTD_isError().
- */
-size_t ZSTD_compressCCtx(ZSTD_CCtx *ctx, void *dst, size_t dstCapacity,
-	const void *src, size_t srcSize, ZSTD_parameters params);
-
-/**
- * ZSTD_DCtxWorkspaceBound() - amount of memory needed to initialize a ZSTD_DCtx
- *
- * Return: A lower bound on the size of the workspace that is passed to
- *         ZSTD_initDCtx().
- */
-size_t ZSTD_DCtxWorkspaceBound(void);
-
-/**
- * struct ZSTD_DCtx - the zstd decompression context
- *
- * When decompressing many times it is recommended to allocate a context just
- * once and reuse it for each successive decompression operation.
- */
-typedef struct ZSTD_DCtx_s ZSTD_DCtx;
-/**
- * ZSTD_initDCtx() - initialize a zstd decompression context
- * @workspace:     The workspace to emplace the context into. It must outlive
- *                 the returned context.
- * @workspaceSize: The size of workspace. Use ZSTD_DCtxWorkspaceBound() to
- *                 determine how large the workspace must be.
- *
- * Return:         A decompression context emplaced into workspace.
- */
-ZSTD_DCtx *ZSTD_initDCtx(void *workspace, size_t workspaceSize);
-
-/**
- * ZSTD_decompressDCtx() - decompress zstd compressed src into dst
- * @ctx:         The decompression context.
- * @dst:         The buffer to decompress src into.
- * @dstCapacity: The size of the destination buffer. Must be at least as large
- *               as the decompressed size. If the caller cannot upper bound the
- *               decompressed size, then it's better to use the streaming API.
- * @src:         The zstd compressed data to decompress. Multiple concatenated
- *               frames and skippable frames are allowed.
- * @srcSize:     The exact size of the data to decompress.
- *
- * Return:       The decompressed size or an error, which can be checked using
- *               ZSTD_isError().
- */
-size_t ZSTD_decompressDCtx(ZSTD_DCtx *ctx, void *dst, size_t dstCapacity,
-	const void *src, size_t srcSize);
-
-/*-************************
- * Simple dictionary API
- **************************/
-
-/**
- * ZSTD_compress_usingDict() - compress src into dst using a dictionary
- * @ctx:         The context. Must have been initialized with a workspace at
- *               least as large as ZSTD_CCtxWorkspaceBound(params.cParams).
- * @dst:         The buffer to compress src into.
- * @dstCapacity: The size of the destination buffer. May be any size, but
- *               ZSTD_compressBound(srcSize) is guaranteed to be large enough.
- * @src:         The data to compress.
- * @srcSize:     The size of the data to compress.
- * @dict:        The dictionary to use for compression.
- * @dictSize:    The size of the dictionary.
- * @params:      The parameters to use for compression. See ZSTD_getParams().
- *
- * Compression using a predefined dictionary. The same dictionary must be used
- * during decompression.
- *
- * Return:       The compressed size or an error, which can be checked using
- *               ZSTD_isError().
- */
-size_t ZSTD_compress_usingDict(ZSTD_CCtx *ctx, void *dst, size_t dstCapacity,
-	const void *src, size_t srcSize, const void *dict, size_t dictSize,
-	ZSTD_parameters params);
-
-/**
- * ZSTD_decompress_usingDict() - decompress src into dst using a dictionary
- * @ctx:         The decompression context.
- * @dst:         The buffer to decompress src into.
- * @dstCapacity: The size of the destination buffer. Must be at least as large
- *               as the decompressed size. If the caller cannot upper bound the
- *               decompressed size, then it's better to use the streaming API.
- * @src:         The zstd compressed data to decompress. Multiple concatenated
- *               frames and skippable frames are allowed.
- * @srcSize:     The exact size of the data to decompress.
- * @dict:        The dictionary to use for decompression. The same dictionary
- *               must've been used to compress the data.
- * @dictSize:    The size of the dictionary.
- *
- * Return:       The decompressed size or an error, which can be checked using
- *               ZSTD_isError().
- */
-size_t ZSTD_decompress_usingDict(ZSTD_DCtx *ctx, void *dst, size_t dstCapacity,
-	const void *src, size_t srcSize, const void *dict, size_t dictSize);
-
-/*-**************************
- * Fast dictionary API
- ***************************/
-
-/**
- * ZSTD_CDictWorkspaceBound() - memory needed to initialize a ZSTD_CDict
- * @cParams: The compression parameters to be used for compression.
+ * zstd_init_cctx() - initialize a zstd compression context
+ * @workspace:      The workspace to emplace the context into. It must outlive
+ *                  the returned context.
+ * @workspace_size: The size of workspace. Use zstd_cctx_workspace_bound() to
+ *                  determine how large the workspace must be.
  *
- * Return:   A lower bound on the size of the workspace that is passed to
- *           ZSTD_initCDict().
- */
-size_t ZSTD_CDictWorkspaceBound(ZSTD_compressionParameters cParams);
-
-/**
- * struct ZSTD_CDict - a digested dictionary to be used for compression
+ * Return:          A zstd compression context or NULL on error.
  */
-typedef struct ZSTD_CDict_s ZSTD_CDict;
+zstd_cctx *zstd_init_cctx(void *workspace, size_t workspace_size);
 
 /**
- * ZSTD_initCDict() - initialize a digested dictionary for compression
- * @dictBuffer:    The dictionary to digest. The buffer is referenced by the
- *                 ZSTD_CDict so it must outlive the returned ZSTD_CDict.
- * @dictSize:      The size of the dictionary.
- * @params:        The parameters to use for compression. See ZSTD_getParams().
- * @workspace:     The workspace. It must outlive the returned ZSTD_CDict.
- * @workspaceSize: The workspace size. Must be at least
- *                 ZSTD_CDictWorkspaceBound(params.cParams).
+ * zstd_compress_cctx() - compress src into dst with the initialized parameters
+ * @cctx:         The context. Must have been initialized with zstd_init_cctx().
+ * @dst:          The buffer to compress src into.
+ * @dst_capacity: The size of the destination buffer. May be any size, but
+ *                ZSTD_compressBound(srcSize) is guaranteed to be large enough.
+ * @src:          The data to compress.
+ * @src_size:     The size of the data to compress.
+ * @parameters:   The compression parameters to be used.
  *
- * When compressing multiple messages / blocks with the same dictionary it is
- * recommended to load it just once. The ZSTD_CDict merely references the
- * dictBuffer, so it must outlive the returned ZSTD_CDict.
- *
- * Return:         The digested dictionary emplaced into workspace.
+ * Return:        The compressed size or an error, which can be checked using
+ *                zstd_is_error().
  */
-ZSTD_CDict *ZSTD_initCDict(const void *dictBuffer, size_t dictSize,
-	ZSTD_parameters params, void *workspace, size_t workspaceSize);
+size_t zstd_compress_cctx(zstd_cctx *cctx, void *dst, size_t dst_capacity,
+	const void *src, size_t src_size, const zstd_parameters *parameters);
 
-/**
- * ZSTD_compress_usingCDict() - compress src into dst using a ZSTD_CDict
- * @ctx:         The context. Must have been initialized with a workspace at
- *               least as large as ZSTD_CCtxWorkspaceBound(cParams) where
- *               cParams are the compression parameters used to initialize the
- *               cdict.
- * @dst:         The buffer to compress src into.
- * @dstCapacity: The size of the destination buffer. May be any size, but
- *               ZSTD_compressBound(srcSize) is guaranteed to be large enough.
- * @src:         The data to compress.
- * @srcSize:     The size of the data to compress.
- * @cdict:       The digested dictionary to use for compression.
- * @params:      The parameters to use for compression. See ZSTD_getParams().
- *
- * Compression using a digested dictionary. The same dictionary must be used
- * during decompression.
- *
- * Return:       The compressed size or an error, which can be checked using
- *               ZSTD_isError().
- */
-size_t ZSTD_compress_usingCDict(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity,
-	const void *src, size_t srcSize, const ZSTD_CDict *cdict);
+/* ======   Single-pass Decompression   ====== */
 
+typedef ZSTD_DCtx zstd_dctx;
 
 /**
- * ZSTD_DDictWorkspaceBound() - memory needed to initialize a ZSTD_DDict
+ * zstd_dctx_workspace_bound() - max memory needed to initialize a zstd_dctx
  *
- * Return:  A lower bound on the size of the workspace that is passed to
- *          ZSTD_initDDict().
- */
-size_t ZSTD_DDictWorkspaceBound(void);
-
-/**
- * struct ZSTD_DDict - a digested dictionary to be used for decompression
+ * Return: A lower bound on the size of the workspace that is passed to
+ *         zstd_init_dctx().
  */
-typedef struct ZSTD_DDict_s ZSTD_DDict;
+size_t zstd_dctx_workspace_bound(void);
 
 /**
- * ZSTD_initDDict() - initialize a digested dictionary for decompression
- * @dictBuffer:    The dictionary to digest. The buffer is referenced by the
- *                 ZSTD_DDict so it must outlive the returned ZSTD_DDict.
- * @dictSize:      The size of the dictionary.
- * @workspace:     The workspace. It must outlive the returned ZSTD_DDict.
- * @workspaceSize: The workspace size. Must be at least
- *                 ZSTD_DDictWorkspaceBound().
- *
- * When decompressing multiple messages / blocks with the same dictionary it is
- * recommended to load it just once. The ZSTD_DDict merely references the
- * dictBuffer, so it must outlive the returned ZSTD_DDict.
+ * zstd_init_dctx() - initialize a zstd decompression context
+ * @workspace:      The workspace to emplace the context into. It must outlive
+ *                  the returned context.
+ * @workspace_size: The size of workspace. Use zstd_dctx_workspace_bound() to
+ *                  determine how large the workspace must be.
  *
- * Return:         The digested dictionary emplaced into workspace.
+ * Return:          A zstd decompression context or NULL on error.
  */
-ZSTD_DDict *ZSTD_initDDict(const void *dictBuffer, size_t dictSize,
-	void *workspace, size_t workspaceSize);
+zstd_dctx *zstd_init_dctx(void *workspace, size_t workspace_size);
 
 /**
- * ZSTD_decompress_usingDDict() - decompress src into dst using a ZSTD_DDict
- * @ctx:         The decompression context.
- * @dst:         The buffer to decompress src into.
- * @dstCapacity: The size of the destination buffer. Must be at least as large
- *               as the decompressed size. If the caller cannot upper bound the
- *               decompressed size, then it's better to use the streaming API.
- * @src:         The zstd compressed data to decompress. Multiple concatenated
- *               frames and skippable frames are allowed.
- * @srcSize:     The exact size of the data to decompress.
- * @ddict:       The digested dictionary to use for decompression. The same
- *               dictionary must've been used to compress the data.
+ * zstd_decompress_dctx() - decompress zstd compressed src into dst
+ * @dctx:         The decompression context.
+ * @dst:          The buffer to decompress src into.
+ * @dst_capacity: The size of the destination buffer. Must be at least as large
+ *                as the decompressed size. If the caller cannot upper bound the
+ *                decompressed size, then it's better to use the streaming API.
+ * @src:          The zstd compressed data to decompress. Multiple concatenated
+ *                frames and skippable frames are allowed.
+ * @src_size:     The exact size of the data to decompress.
  *
- * Return:       The decompressed size or an error, which can be checked using
- *               ZSTD_isError().
+ * Return:        The decompressed size or an error, which can be checked using
+ *                zstd_is_error().
  */
-size_t ZSTD_decompress_usingDDict(ZSTD_DCtx *dctx, void *dst,
-	size_t dstCapacity, const void *src, size_t srcSize,
-	const ZSTD_DDict *ddict);
+size_t zstd_decompress_dctx(zstd_dctx *dctx, void *dst, size_t dst_capacity,
+	const void *src, size_t src_size);
 
-
-/*-**************************
- * Streaming
- ***************************/
+/* ======   Streaming Buffers   ====== */
 
 /**
- * struct ZSTD_inBuffer - input buffer for streaming
+ * struct zstd_in_buffer - input buffer for streaming
  * @src:  Start of the input buffer.
  * @size: Size of the input buffer.
  * @pos:  Position where reading stopped. Will be updated.
  *        Necessarily 0 <= pos <= size.
+ *
+ * See zstd_lib.h.
  */
-typedef struct ZSTD_inBuffer_s {
-	const void *src;
-	size_t size;
-	size_t pos;
-} ZSTD_inBuffer;
+typedef ZSTD_inBuffer zstd_in_buffer;
 
 /**
- * struct ZSTD_outBuffer - output buffer for streaming
+ * struct zstd_out_buffer - output buffer for streaming
  * @dst:  Start of the output buffer.
  * @size: Size of the output buffer.
  * @pos:  Position where writing stopped. Will be updated.
  *        Necessarily 0 <= pos <= size.
+ *
+ * See zstd_lib.h.
  */
-typedef struct ZSTD_outBuffer_s {
-	void *dst;
-	size_t size;
-	size_t pos;
-} ZSTD_outBuffer;
+typedef ZSTD_outBuffer zstd_out_buffer;
 
+/* ======   Streaming Compression   ====== */
 
-
-/*-*****************************************************************************
- * Streaming compression - HowTo
- *
- * A ZSTD_CStream object is required to track streaming operation.
- * Use ZSTD_initCStream() to initialize a ZSTD_CStream object.
- * ZSTD_CStream objects can be reused multiple times on consecutive compression
- * operations. It is recommended to re-use ZSTD_CStream in situations where many
- * streaming operations will be achieved consecutively. Use one separate
- * ZSTD_CStream per thread for parallel execution.
- *
- * Use ZSTD_compressStream() repetitively to consume input stream.
- * The function will automatically update both `pos` fields.
- * Note that it may not consume the entire input, in which case `pos < size`,
- * and it's up to the caller to present again remaining data.
- * It returns a hint for the preferred number of bytes to use as an input for
- * the next function call.
- *
- * At any moment, it's possible to flush whatever data remains within internal
- * buffer, using ZSTD_flushStream(). `output->pos` will be updated. There might
- * still be some content left within the internal buffer if `output->size` is
- * too small. It returns the number of bytes left in the internal buffer and
- * must be called until it returns 0.
- *
- * ZSTD_endStream() instructs to finish a frame. It will perform a flush and
- * write frame epilogue. The epilogue is required for decoders to consider a
- * frame completed. Similar to ZSTD_flushStream(), it may not be able to flush
- * the full content if `output->size` is too small. In which case, call again
- * ZSTD_endStream() to complete the flush. It returns the number of bytes left
- * in the internal buffer and must be called until it returns 0.
- ******************************************************************************/
+typedef ZSTD_CStream zstd_cstream;
 
 /**
- * ZSTD_CStreamWorkspaceBound() - memory needed to initialize a ZSTD_CStream
- * @cParams: The compression parameters to be used for compression.
+ * zstd_cstream_workspace_bound() - memory needed to initialize a zstd_cstream
+ * @cparams: The compression parameters to be used for compression.
  *
  * Return:   A lower bound on the size of the workspace that is passed to
- *           ZSTD_initCStream() and ZSTD_initCStream_usingCDict().
- */
-size_t ZSTD_CStreamWorkspaceBound(ZSTD_compressionParameters cParams);
-
-/**
- * struct ZSTD_CStream - the zstd streaming compression context
+ *           zstd_init_cstream().
  */
-typedef struct ZSTD_CStream_s ZSTD_CStream;
+size_t zstd_cstream_workspace_bound(const zstd_compression_parameters *cparams);
 
-/*===== ZSTD_CStream management functions =====*/
 /**
- * ZSTD_initCStream() - initialize a zstd streaming compression context
- * @params:         The zstd compression parameters.
- * @pledgedSrcSize: If params.fParams.contentSizeFlag == 1 then the caller must
- *                  pass the source size (zero means empty source). Otherwise,
- *                  the caller may optionally pass the source size, or zero if
- *                  unknown.
- * @workspace:      The workspace to emplace the context into. It must outlive
- *                  the returned context.
- * @workspaceSize:  The size of workspace.
- *                  Use ZSTD_CStreamWorkspaceBound(params.cParams) to determine
- *                  how large the workspace must be.
+ * zstd_init_cstream() - initialize a zstd streaming compression context
+ * @parameters        The zstd parameters to use for compression.
+ * @pledged_src_size: If params.fParams.contentSizeFlag == 1 then the caller
+ *                    must pass the source size (zero means empty source).
+ *                    Otherwise, the caller may optionally pass the source
+ *                    size, or zero if unknown.
+ * @workspace:        The workspace to emplace the context into. It must outlive
+ *                    the returned context.
+ * @workspace_size:   The size of workspace.
+ *                    Use zstd_cstream_workspace_bound(params->cparams) to
+ *                    determine how large the workspace must be.
  *
- * Return:          The zstd streaming compression context.
+ * Return:            The zstd streaming compression context or NULL on error.
  */
-ZSTD_CStream *ZSTD_initCStream(ZSTD_parameters params,
-	unsigned long long pledgedSrcSize, void *workspace,
-	size_t workspaceSize);
+zstd_cstream *zstd_init_cstream(const zstd_parameters *parameters,
+	unsigned long long pledged_src_size, void *workspace, size_t workspace_size);
 
 /**
- * ZSTD_initCStream_usingCDict() - initialize a streaming compression context
- * @cdict:          The digested dictionary to use for compression.
- * @pledgedSrcSize: Optionally the source size, or zero if unknown.
- * @workspace:      The workspace to emplace the context into. It must outlive
- *                  the returned context.
- * @workspaceSize:  The size of workspace. Call ZSTD_CStreamWorkspaceBound()
- *                  with the cParams used to initialize the cdict to determine
- *                  how large the workspace must be.
- *
- * Return:          The zstd streaming compression context.
- */
-ZSTD_CStream *ZSTD_initCStream_usingCDict(const ZSTD_CDict *cdict,
-	unsigned long long pledgedSrcSize, void *workspace,
-	size_t workspaceSize);
-
-/*===== Streaming compression functions =====*/
-/**
- * ZSTD_resetCStream() - reset the context using parameters from creation
- * @zcs:            The zstd streaming compression context to reset.
- * @pledgedSrcSize: Optionally the source size, or zero if unknown.
+ * zstd_reset_cstream() - reset the context using parameters from creation
+ * @cstream:          The zstd streaming compression context to reset.
+ * @pledged_src_size: Optionally the source size, or zero if unknown.
  *
  * Resets the context using the parameters from creation. Skips dictionary
- * loading, since it can be reused. If `pledgedSrcSize` is non-zero the frame
+ * loading, since it can be reused. If `pledged_src_size` is non-zero the frame
  * content size is always written into the frame header.
  *
- * Return:          Zero or an error, which can be checked using ZSTD_isError().
+ * Return:            Zero or an error, which can be checked using
+ *                    zstd_is_error().
  */
-size_t ZSTD_resetCStream(ZSTD_CStream *zcs, unsigned long long pledgedSrcSize);
+size_t zstd_reset_cstream(zstd_cstream *cstream,
+	unsigned long long pledged_src_size);
+
 /**
- * ZSTD_compressStream() - streaming compress some of input into output
- * @zcs:    The zstd streaming compression context.
- * @output: Destination buffer. `output->pos` is updated to indicate how much
- *          compressed data was written.
- * @input:  Source buffer. `input->pos` is updated to indicate how much data was
- *          read. Note that it may not consume the entire input, in which case
- *          `input->pos < input->size`, and it's up to the caller to present
- *          remaining data again.
+ * zstd_compress_stream() - streaming compress some of input into output
+ * @cstream: The zstd streaming compression context.
+ * @output:  Destination buffer. `output->pos` is updated to indicate how much
+ *           compressed data was written.
+ * @input:   Source buffer. `input->pos` is updated to indicate how much data
+ *           was read. Note that it may not consume the entire input, in which
+ *           case `input->pos < input->size`, and it's up to the caller to
+ *           present remaining data again.
  *
  * The `input` and `output` buffers may be any size. Guaranteed to make some
  * forward progress if `input` and `output` are not empty.
  *
- * Return:  A hint for the number of bytes to use as the input for the next
- *          function call or an error, which can be checked using
- *          ZSTD_isError().
+ * Return:   A hint for the number of bytes to use as the input for the next
+ *           function call or an error, which can be checked using
+ *           zstd_is_error().
  */
-size_t ZSTD_compressStream(ZSTD_CStream *zcs, ZSTD_outBuffer *output,
-	ZSTD_inBuffer *input);
+size_t zstd_compress_stream(zstd_cstream *cstream, zstd_out_buffer *output,
+	zstd_in_buffer *input);
+
 /**
- * ZSTD_flushStream() - flush internal buffers into output
- * @zcs:    The zstd streaming compression context.
- * @output: Destination buffer. `output->pos` is updated to indicate how much
- *          compressed data was written.
+ * zstd_flush_stream() - flush internal buffers into output
+ * @cstream: The zstd streaming compression context.
+ * @output:  Destination buffer. `output->pos` is updated to indicate how much
+ *           compressed data was written.
  *
- * ZSTD_flushStream() must be called until it returns 0, meaning all the data
- * has been flushed. Since ZSTD_flushStream() causes a block to be ended,
+ * zstd_flush_stream() must be called until it returns 0, meaning all the data
+ * has been flushed. Since zstd_flush_stream() causes a block to be ended,
  * calling it too often will degrade the compression ratio.
  *
- * Return:  The number of bytes still present within internal buffers or an
- *          error, which can be checked using ZSTD_isError().
- */
-size_t ZSTD_flushStream(ZSTD_CStream *zcs, ZSTD_outBuffer *output);
-/**
- * ZSTD_endStream() - flush internal buffers into output and end the frame
- * @zcs:    The zstd streaming compression context.
- * @output: Destination buffer. `output->pos` is updated to indicate how much
- *          compressed data was written.
- *
- * ZSTD_endStream() must be called until it returns 0, meaning all the data has
- * been flushed and the frame epilogue has been written.
- *
- * Return:  The number of bytes still present within internal buffers or an
- *          error, which can be checked using ZSTD_isError().
+ * Return:   The number of bytes still present within internal buffers or an
+ *           error, which can be checked using zstd_is_error().
  */
-size_t ZSTD_endStream(ZSTD_CStream *zcs, ZSTD_outBuffer *output);
+size_t zstd_flush_stream(zstd_cstream *cstream, zstd_out_buffer *output);
 
 /**
- * ZSTD_CStreamInSize() - recommended size for the input buffer
- *
- * Return: The recommended size for the input buffer.
- */
-size_t ZSTD_CStreamInSize(void);
-/**
- * ZSTD_CStreamOutSize() - recommended size for the output buffer
+ * zstd_end_stream() - flush internal buffers into output and end the frame
+ * @cstream: The zstd streaming compression context.
+ * @output:  Destination buffer. `output->pos` is updated to indicate how much
+ *           compressed data was written.
  *
- * When the output buffer is at least this large, it is guaranteed to be large
- * enough to flush at least one complete compressed block.
+ * zstd_end_stream() must be called until it returns 0, meaning all the data has
+ * been flushed and the frame epilogue has been written.
  *
- * Return: The recommended size for the output buffer.
+ * Return:   The number of bytes still present within internal buffers or an
+ *           error, which can be checked using zstd_is_error().
  */
-size_t ZSTD_CStreamOutSize(void);
+size_t zstd_end_stream(zstd_cstream *cstream, zstd_out_buffer *output);
 
+/* ======   Streaming Decompression   ====== */
 
-
-/*-*****************************************************************************
- * Streaming decompression - HowTo
- *
- * A ZSTD_DStream object is required to track streaming operations.
- * Use ZSTD_initDStream() to initialize a ZSTD_DStream object.
- * ZSTD_DStream objects can be re-used multiple times.
- *
- * Use ZSTD_decompressStream() repetitively to consume your input.
- * The function will update both `pos` fields.
- * If `input->pos < input->size`, some input has not been consumed.
- * It's up to the caller to present again remaining data.
- * If `output->pos < output->size`, decoder has flushed everything it could.
- * Returns 0 iff a frame is completely decoded and fully flushed.
- * Otherwise it returns a suggested next input size that will never load more
- * than the current frame.
- ******************************************************************************/
+typedef ZSTD_DStream zstd_dstream;
 
 /**
- * ZSTD_DStreamWorkspaceBound() - memory needed to initialize a ZSTD_DStream
- * @maxWindowSize: The maximum window size allowed for compressed frames.
+ * zstd_dstream_workspace_bound() - memory needed to initialize a zstd_dstream
+ * @max_window_size: The maximum window size allowed for compressed frames.
  *
- * Return:         A lower bound on the size of the workspace that is passed to
- *                 ZSTD_initDStream() and ZSTD_initDStream_usingDDict().
+ * Return:           A lower bound on the size of the workspace that is passed
+ *                   to zstd_init_dstream().
  */
-size_t ZSTD_DStreamWorkspaceBound(size_t maxWindowSize);
+size_t zstd_dstream_workspace_bound(size_t max_window_size);
 
 /**
- * struct ZSTD_DStream - the zstd streaming decompression context
- */
-typedef struct ZSTD_DStream_s ZSTD_DStream;
-/*===== ZSTD_DStream management functions =====*/
-/**
- * ZSTD_initDStream() - initialize a zstd streaming decompression context
- * @maxWindowSize: The maximum window size allowed for compressed frames.
- * @workspace:     The workspace to emplace the context into. It must outlive
- *                 the returned context.
- * @workspaceSize: The size of workspace.
- *                 Use ZSTD_DStreamWorkspaceBound(maxWindowSize) to determine
- *                 how large the workspace must be.
- *
- * Return:         The zstd streaming decompression context.
- */
-ZSTD_DStream *ZSTD_initDStream(size_t maxWindowSize, void *workspace,
-	size_t workspaceSize);
-/**
- * ZSTD_initDStream_usingDDict() - initialize streaming decompression context
- * @maxWindowSize: The maximum window size allowed for compressed frames.
- * @ddict:         The digested dictionary to use for decompression.
- * @workspace:     The workspace to emplace the context into. It must outlive
- *                 the returned context.
- * @workspaceSize: The size of workspace.
- *                 Use ZSTD_DStreamWorkspaceBound(maxWindowSize) to determine
- *                 how large the workspace must be.
+ * zstd_init_dstream() - initialize a zstd streaming decompression context
+ * @max_window_size: The maximum window size allowed for compressed frames.
+ * @workspace:       The workspace to emplace the context into. It must outlive
+ *                   the returned context.
+ * @workspaceSize:   The size of workspace.
+ *                   Use zstd_dstream_workspace_bound(max_window_size) to
+ *                   determine how large the workspace must be.
  *
- * Return:         The zstd streaming decompression context.
+ * Return:           The zstd streaming decompression context.
  */
-ZSTD_DStream *ZSTD_initDStream_usingDDict(size_t maxWindowSize,
-	const ZSTD_DDict *ddict, void *workspace, size_t workspaceSize);
+zstd_dstream *zstd_init_dstream(size_t max_window_size, void *workspace,
+	size_t workspace_size);
 
-/*===== Streaming decompression functions =====*/
 /**
- * ZSTD_resetDStream() - reset the context using parameters from creation
- * @zds:   The zstd streaming decompression context to reset.
+ * zstd_reset_dstream() - reset the context using parameters from creation
+ * @dstream: The zstd streaming decompression context to reset.
  *
  * Resets the context using the parameters from creation. Skips dictionary
  * loading, since it can be reused.
  *
- * Return: Zero or an error, which can be checked using ZSTD_isError().
+ * Return:   Zero or an error, which can be checked using zstd_is_error().
  */
-size_t ZSTD_resetDStream(ZSTD_DStream *zds);
+size_t zstd_reset_dstream(zstd_dstream *dstream);
+
 /**
- * ZSTD_decompressStream() - streaming decompress some of input into output
- * @zds:    The zstd streaming decompression context.
- * @output: Destination buffer. `output.pos` is updated to indicate how much
- *          decompressed data was written.
- * @input:  Source buffer. `input.pos` is updated to indicate how much data was
- *          read. Note that it may not consume the entire input, in which case
- *          `input.pos < input.size`, and it's up to the caller to present
- *          remaining data again.
+ * zstd_decompress_stream() - streaming decompress some of input into output
+ * @dstream: The zstd streaming decompression context.
+ * @output:  Destination buffer. `output.pos` is updated to indicate how much
+ *           decompressed data was written.
+ * @input:   Source buffer. `input.pos` is updated to indicate how much data was
+ *           read. Note that it may not consume the entire input, in which case
+ *           `input.pos < input.size`, and it's up to the caller to present
+ *           remaining data again.
  *
  * The `input` and `output` buffers may be any size. Guaranteed to make some
  * forward progress if `input` and `output` are not empty.
- * ZSTD_decompressStream() will not consume the last byte of the frame until
+ * zstd_decompress_stream() will not consume the last byte of the frame until
  * the entire frame is flushed.
  *
- * Return:  Returns 0 iff a frame is completely decoded and fully flushed.
- *          Otherwise returns a hint for the number of bytes to use as the input
- *          for the next function call or an error, which can be checked using
- *          ZSTD_isError(). The size hint will never load more than the frame.
- */
-size_t ZSTD_decompressStream(ZSTD_DStream *zds, ZSTD_outBuffer *output,
-	ZSTD_inBuffer *input);
-
-/**
- * ZSTD_DStreamInSize() - recommended size for the input buffer
- *
- * Return: The recommended size for the input buffer.
- */
-size_t ZSTD_DStreamInSize(void);
-/**
- * ZSTD_DStreamOutSize() - recommended size for the output buffer
- *
- * When the output buffer is at least this large, it is guaranteed to be large
- * enough to flush at least one complete decompressed block.
- *
- * Return: The recommended size for the output buffer.
- */
-size_t ZSTD_DStreamOutSize(void);
-
-
-/* --- Constants ---*/
-#define ZSTD_MAGICNUMBER            0xFD2FB528   /* >= v0.8.0 */
-#define ZSTD_MAGIC_SKIPPABLE_START  0x184D2A50U
-
-#define ZSTD_CONTENTSIZE_UNKNOWN (0ULL - 1)
-#define ZSTD_CONTENTSIZE_ERROR   (0ULL - 2)
-
-#define ZSTD_WINDOWLOG_MAX_32  27
-#define ZSTD_WINDOWLOG_MAX_64  27
-#define ZSTD_WINDOWLOG_MAX \
-	((unsigned int)(sizeof(size_t) == 4 \
-		? ZSTD_WINDOWLOG_MAX_32 \
-		: ZSTD_WINDOWLOG_MAX_64))
-#define ZSTD_WINDOWLOG_MIN 10
-#define ZSTD_HASHLOG_MAX ZSTD_WINDOWLOG_MAX
-#define ZSTD_HASHLOG_MIN        6
-#define ZSTD_CHAINLOG_MAX     (ZSTD_WINDOWLOG_MAX+1)
-#define ZSTD_CHAINLOG_MIN      ZSTD_HASHLOG_MIN
-#define ZSTD_HASHLOG3_MAX      17
-#define ZSTD_SEARCHLOG_MAX    (ZSTD_WINDOWLOG_MAX-1)
-#define ZSTD_SEARCHLOG_MIN      1
-/* only for ZSTD_fast, other strategies are limited to 6 */
-#define ZSTD_SEARCHLENGTH_MAX   7
-/* only for ZSTD_btopt, other strategies are limited to 4 */
-#define ZSTD_SEARCHLENGTH_MIN   3
-#define ZSTD_TARGETLENGTH_MIN   4
-#define ZSTD_TARGETLENGTH_MAX 999
-
-/* for static allocation */
-#define ZSTD_FRAMEHEADERSIZE_MAX 18
-#define ZSTD_FRAMEHEADERSIZE_MIN  6
-#define ZSTD_frameHeaderSize_prefix 5
-#define ZSTD_frameHeaderSize_min ZSTD_FRAMEHEADERSIZE_MIN
-#define ZSTD_frameHeaderSize_max ZSTD_FRAMEHEADERSIZE_MAX
-/* magic number + skippable frame length */
-#define ZSTD_skippableHeaderSize 8
-
-
-/*-*************************************
- * Compressed size functions
- **************************************/
-
-/**
- * ZSTD_findFrameCompressedSize() - returns the size of a compressed frame
- * @src:     Source buffer. It should point to the start of a zstd encoded frame
- *           or a skippable frame.
- * @srcSize: The size of the source buffer. It must be at least as large as the
- *           size of the frame.
- *
- * Return:   The compressed size of the frame pointed to by `src` or an error,
- *           which can be check with ZSTD_isError().
- *           Suitable to pass to ZSTD_decompress() or similar functions.
- */
-size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize);
-
-/*-*************************************
- * Decompressed size functions
- **************************************/
-/**
- * ZSTD_getFrameContentSize() - returns the content size in a zstd frame header
- * @src:     It should point to the start of a zstd encoded frame.
- * @srcSize: The size of the source buffer. It must be at least as large as the
- *           frame header. `ZSTD_frameHeaderSize_max` is always large enough.
- *
- * Return:   The frame content size stored in the frame header if known.
- *           `ZSTD_CONTENTSIZE_UNKNOWN` if the content size isn't stored in the
- *           frame header. `ZSTD_CONTENTSIZE_ERROR` on invalid input.
- */
-unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize);
-
-/**
- * ZSTD_findDecompressedSize() - returns decompressed size of a series of frames
- * @src:     It should point to the start of a series of zstd encoded and/or
- *           skippable frames.
- * @srcSize: The exact size of the series of frames.
- *
- * If any zstd encoded frame in the series doesn't have the frame content size
- * set, `ZSTD_CONTENTSIZE_UNKNOWN` is returned. But frame content size is always
- * set when using ZSTD_compress(). The decompressed size can be very large.
- * If the source is untrusted, the decompressed size could be wrong or
- * intentionally modified. Always ensure the result fits within the
- * application's authorized limits. ZSTD_findDecompressedSize() handles multiple
- * frames, and so it must traverse the input to read each frame header. This is
- * efficient as most of the data is skipped, however it does mean that all frame
- * data must be present and valid.
- *
- * Return:   Decompressed size of all the data contained in the frames if known.
- *           `ZSTD_CONTENTSIZE_UNKNOWN` if the decompressed size is unknown.
- *           `ZSTD_CONTENTSIZE_ERROR` if an error occurred.
- */
-unsigned long long ZSTD_findDecompressedSize(const void *src, size_t srcSize);
-
-/*-*************************************
- * Advanced compression functions
- **************************************/
-/**
- * ZSTD_checkCParams() - ensure parameter values remain within authorized range
- * @cParams: The zstd compression parameters.
- *
- * Return:   Zero or an error, which can be checked using ZSTD_isError().
- */
-size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams);
-
-/**
- * ZSTD_adjustCParams() - optimize parameters for a given srcSize and dictSize
- * @srcSize:  Optionally the estimated source size, or zero if unknown.
- * @dictSize: Optionally the estimated dictionary size, or zero if unknown.
- *
- * Return:    The optimized parameters.
- */
-ZSTD_compressionParameters ZSTD_adjustCParams(
-	ZSTD_compressionParameters cParams, unsigned long long srcSize,
-	size_t dictSize);
-
-/*--- Advanced decompression functions ---*/
-
-/**
- * ZSTD_isFrame() - returns true iff the buffer starts with a valid frame
- * @buffer: The source buffer to check.
- * @size:   The size of the source buffer, must be at least 4 bytes.
- *
- * Return: True iff the buffer starts with a zstd or skippable frame identifier.
- */
-unsigned int ZSTD_isFrame(const void *buffer, size_t size);
-
-/**
- * ZSTD_getDictID_fromDict() - returns the dictionary id stored in a dictionary
- * @dict:     The dictionary buffer.
- * @dictSize: The size of the dictionary buffer.
- *
- * Return:    The dictionary id stored within the dictionary or 0 if the
- *            dictionary is not a zstd dictionary. If it returns 0 the
- *            dictionary can still be loaded as a content-only dictionary.
+ * Return:   Returns 0 iff a frame is completely decoded and fully flushed.
+ *           Otherwise returns a hint for the number of bytes to use as the
+ *           input for the next function call or an error, which can be checked
+ *           using zstd_is_error(). The size hint will never load more than the
+ *           frame.
  */
-unsigned int ZSTD_getDictID_fromDict(const void *dict, size_t dictSize);
+size_t zstd_decompress_stream(zstd_dstream *dstream, zstd_out_buffer *output,
+	zstd_in_buffer *input);
 
-/**
- * ZSTD_getDictID_fromDDict() - returns the dictionary id stored in a ZSTD_DDict
- * @ddict: The ddict to find the id of.
- *
- * Return: The dictionary id stored within `ddict` or 0 if the dictionary is not
- *         a zstd dictionary. If it returns 0 `ddict` will be loaded as a
- *         content-only dictionary.
- */
-unsigned int ZSTD_getDictID_fromDDict(const ZSTD_DDict *ddict);
+/* ======   Frame Inspection Functions ====== */
 
 /**
- * ZSTD_getDictID_fromFrame() - returns the dictionary id stored in a zstd frame
- * @src:     Source buffer. It must be a zstd encoded frame.
- * @srcSize: The size of the source buffer. It must be at least as large as the
- *           frame header. `ZSTD_frameHeaderSize_max` is always large enough.
+ * zstd_find_frame_compressed_size() - returns the size of a compressed frame
+ * @src:      Source buffer. It should point to the start of a zstd encoded
+ *            frame or a skippable frame.
+ * @src_size: The size of the source buffer. It must be at least as large as the
+ *            size of the frame.
  *
- * Return:   The dictionary id required to decompress the frame stored within
- *           `src` or 0 if the dictionary id could not be decoded. It can return
- *           0 if the frame does not require a dictionary, the dictionary id
- *           wasn't stored in the frame, `src` is not a zstd frame, or `srcSize`
- *           is too small.
+ * Return:    The compressed size of the frame pointed to by `src` or an error,
+ *            which can be check with zstd_is_error().
+ *            Suitable to pass to ZSTD_decompress() or similar functions.
  */
-unsigned int ZSTD_getDictID_fromFrame(const void *src, size_t srcSize);
+size_t zstd_find_frame_compressed_size(const void *src, size_t src_size);
 
 /**
- * struct ZSTD_frameParams - zstd frame parameters stored in the frame header
+ * struct zstd_frame_params - zstd frame parameters stored in the frame header
  * @frameContentSize: The frame content size, or 0 if not present.
  * @windowSize:       The window size, or 0 if the frame is a skippable frame.
  * @dictID:           The dictionary id, or 0 if not present.
  * @checksumFlag:     Whether a checksum was used.
  */
-typedef struct {
-	unsigned long long frameContentSize;
-	unsigned int windowSize;
-	unsigned int dictID;
-	unsigned int checksumFlag;
-} ZSTD_frameParams;
+typedef ZSTD_frameParams zstd_frame_header;
 
 /**
- * ZSTD_getFrameParams() - extracts parameters from a zstd or skippable frame
- * @fparamsPtr: On success the frame parameters are written here.
- * @src:        The source buffer. It must point to a zstd or skippable frame.
- * @srcSize:    The size of the source buffer. `ZSTD_frameHeaderSize_max` is
- *              always large enough to succeed.
+ * zstd_get_frame_header() - extracts parameters from a zstd or skippable frame
+ * @params:   On success the frame parameters are written here.
+ * @src:      The source buffer. It must point to a zstd or skippable frame.
+ * @src_size: The size of the source buffer.
  *
- * Return:      0 on success. If more data is required it returns how many bytes
- *              must be provided to make forward progress. Otherwise it returns
- *              an error, which can be checked using ZSTD_isError().
+ * Return:    0 on success. If more data is required it returns how many bytes
+ *            must be provided to make forward progress. Otherwise it returns
+ *            an error, which can be checked using zstd_is_error().
  */
-size_t ZSTD_getFrameParams(ZSTD_frameParams *fparamsPtr, const void *src,
-	size_t srcSize);
-
-/*-*****************************************************************************
- * Buffer-less and synchronous inner streaming functions
- *
- * This is an advanced API, giving full control over buffer management, for
- * users which need direct control over memory.
- * But it's also a complex one, with many restrictions (documented below).
- * Prefer using normal streaming API for an easier experience
- ******************************************************************************/
-
-/*-*****************************************************************************
- * Buffer-less streaming compression (synchronous mode)
- *
- * A ZSTD_CCtx object is required to track streaming operations.
- * Use ZSTD_initCCtx() to initialize a context.
- * ZSTD_CCtx object can be re-used multiple times within successive compression
- * operations.
- *
- * Start by initializing a context.
- * Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary
- * compression,
- * or ZSTD_compressBegin_advanced(), for finer parameter control.
- * It's also possible to duplicate a reference context which has already been
- * initialized, using ZSTD_copyCCtx()
- *
- * Then, consume your input using ZSTD_compressContinue().
- * There are some important considerations to keep in mind when using this
- * advanced function :
- * - ZSTD_compressContinue() has no internal buffer. It uses externally provided
- *   buffer only.
- * - Interface is synchronous : input is consumed entirely and produce 1+
- *   (or more) compressed blocks.
- * - Caller must ensure there is enough space in `dst` to store compressed data
- *   under worst case scenario. Worst case evaluation is provided by
- *   ZSTD_compressBound().
- *   ZSTD_compressContinue() doesn't guarantee recover after a failed
- *   compression.
- * - ZSTD_compressContinue() presumes prior input ***is still accessible and
- *   unmodified*** (up to maximum distance size, see WindowLog).
- *   It remembers all previous contiguous blocks, plus one separated memory
- *   segment (which can itself consists of multiple contiguous blocks)
- * - ZSTD_compressContinue() detects that prior input has been overwritten when
- *   `src` buffer overlaps. In which case, it will "discard" the relevant memory
- *   section from its history.
- *
- * Finish a frame with ZSTD_compressEnd(), which will write the last block(s)
- * and optional checksum. It's possible to use srcSize==0, in which case, it
- * will write a final empty block to end the frame. Without last block mark,
- * frames will be considered unfinished (corrupted) by decoders.
- *
- * `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress some new
- * frame.
- ******************************************************************************/
-
-/*=====   Buffer-less streaming compression functions  =====*/
-size_t ZSTD_compressBegin(ZSTD_CCtx *cctx, int compressionLevel);
-size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx *cctx, const void *dict,
-	size_t dictSize, int compressionLevel);
-size_t ZSTD_compressBegin_advanced(ZSTD_CCtx *cctx, const void *dict,
-	size_t dictSize, ZSTD_parameters params,
-	unsigned long long pledgedSrcSize);
-size_t ZSTD_copyCCtx(ZSTD_CCtx *cctx, const ZSTD_CCtx *preparedCCtx,
-	unsigned long long pledgedSrcSize);
-size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx *cctx, const ZSTD_CDict *cdict,
-	unsigned long long pledgedSrcSize);
-size_t ZSTD_compressContinue(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity,
-	const void *src, size_t srcSize);
-size_t ZSTD_compressEnd(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity,
-	const void *src, size_t srcSize);
-
-
-
-/*-*****************************************************************************
- * Buffer-less streaming decompression (synchronous mode)
- *
- * A ZSTD_DCtx object is required to track streaming operations.
- * Use ZSTD_initDCtx() to initialize a context.
- * A ZSTD_DCtx object can be re-used multiple times.
- *
- * First typical operation is to retrieve frame parameters, using
- * ZSTD_getFrameParams(). It fills a ZSTD_frameParams structure which provide
- * important information to correctly decode the frame, such as the minimum
- * rolling buffer size to allocate to decompress data (`windowSize`), and the
- * dictionary ID used.
- * Note: content size is optional, it may not be present. 0 means unknown.
- * Note that these values could be wrong, either because of data malformation,
- * or because an attacker is spoofing deliberate false information. As a
- * consequence, check that values remain within valid application range,
- * especially `windowSize`, before allocation. Each application can set its own
- * limit, depending on local restrictions. For extended interoperability, it is
- * recommended to support at least 8 MB.
- * Frame parameters are extracted from the beginning of the compressed frame.
- * Data fragment must be large enough to ensure successful decoding, typically
- * `ZSTD_frameHeaderSize_max` bytes.
- * Result: 0: successful decoding, the `ZSTD_frameParams` structure is filled.
- *        >0: `srcSize` is too small, provide at least this many bytes.
- *        errorCode, which can be tested using ZSTD_isError().
- *
- * Start decompression, with ZSTD_decompressBegin() or
- * ZSTD_decompressBegin_usingDict(). Alternatively, you can copy a prepared
- * context, using ZSTD_copyDCtx().
- *
- * Then use ZSTD_nextSrcSizeToDecompress() and ZSTD_decompressContinue()
- * alternatively.
- * ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize'
- * to ZSTD_decompressContinue().
- * ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will
- * fail.
- *
- * The result of ZSTD_decompressContinue() is the number of bytes regenerated
- * within 'dst' (necessarily <= dstCapacity). It can be zero, which is not an
- * error; it just means ZSTD_decompressContinue() has decoded some metadata
- * item. It can also be an error code, which can be tested with ZSTD_isError().
- *
- * ZSTD_decompressContinue() needs previous data blocks during decompression, up
- * to `windowSize`. They should preferably be located contiguously, prior to
- * current block. Alternatively, a round buffer of sufficient size is also
- * possible. Sufficient size is determined by frame parameters.
- * ZSTD_decompressContinue() is very sensitive to contiguity, if 2 blocks don't
- * follow each other, make sure that either the compressor breaks contiguity at
- * the same place, or that previous contiguous segment is large enough to
- * properly handle maximum back-reference.
- *
- * A frame is fully decoded when ZSTD_nextSrcSizeToDecompress() returns zero.
- * Context can then be reset to start a new decompression.
- *
- * Note: it's possible to know if next input to present is a header or a block,
- * using ZSTD_nextInputType(). This information is not required to properly
- * decode a frame.
- *
- * == Special case: skippable frames ==
- *
- * Skippable frames allow integration of user-defined data into a flow of
- * concatenated frames. Skippable frames will be ignored (skipped) by a
- * decompressor. The format of skippable frames is as follows:
- * a) Skippable frame ID - 4 Bytes, Little endian format, any value from
- *    0x184D2A50 to 0x184D2A5F
- * b) Frame Size - 4 Bytes, Little endian format, unsigned 32-bits
- * c) Frame Content - any content (User Data) of length equal to Frame Size
- * For skippable frames ZSTD_decompressContinue() always returns 0.
- * For skippable frames ZSTD_getFrameParams() returns fparamsPtr->windowLog==0
- * what means that a frame is skippable.
- * Note: If fparamsPtr->frameContentSize==0, it is ambiguous: the frame might
- *       actually be a zstd encoded frame with no content. For purposes of
- *       decompression, it is valid in both cases to skip the frame using
- *       ZSTD_findFrameCompressedSize() to find its size in bytes.
- * It also returns frame size as fparamsPtr->frameContentSize.
- ******************************************************************************/
-
-/*=====   Buffer-less streaming decompression functions  =====*/
-size_t ZSTD_decompressBegin(ZSTD_DCtx *dctx);
-size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx *dctx, const void *dict,
-	size_t dictSize);
-void   ZSTD_copyDCtx(ZSTD_DCtx *dctx, const ZSTD_DCtx *preparedDCtx);
-size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx *dctx);
-size_t ZSTD_decompressContinue(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity,
-	const void *src, size_t srcSize);
-typedef enum {
-	ZSTDnit_frameHeader,
-	ZSTDnit_blockHeader,
-	ZSTDnit_block,
-	ZSTDnit_lastBlock,
-	ZSTDnit_checksum,
-	ZSTDnit_skippableFrame
-} ZSTD_nextInputType_e;
-ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx *dctx);
-
-/*-*****************************************************************************
- * Block functions
- *
- * Block functions produce and decode raw zstd blocks, without frame metadata.
- * Frame metadata cost is typically ~18 bytes, which can be non-negligible for
- * very small blocks (< 100 bytes). User will have to take in charge required
- * information to regenerate data, such as compressed and content sizes.
- *
- * A few rules to respect:
- * - Compressing and decompressing require a context structure
- *   + Use ZSTD_initCCtx() and ZSTD_initDCtx()
- * - It is necessary to init context before starting
- *   + compression : ZSTD_compressBegin()
- *   + decompression : ZSTD_decompressBegin()
- *   + variants _usingDict() are also allowed
- *   + copyCCtx() and copyDCtx() work too
- * - Block size is limited, it must be <= ZSTD_getBlockSizeMax()
- *   + If you need to compress more, cut data into multiple blocks
- *   + Consider using the regular ZSTD_compress() instead, as frame metadata
- *     costs become negligible when source size is large.
- * - When a block is considered not compressible enough, ZSTD_compressBlock()
- *   result will be zero. In which case, nothing is produced into `dst`.
- *   + User must test for such outcome and deal directly with uncompressed data
- *   + ZSTD_decompressBlock() doesn't accept uncompressed data as input!!!
- *   + In case of multiple successive blocks, decoder must be informed of
- *     uncompressed block existence to follow proper history. Use
- *     ZSTD_insertBlock() in such a case.
- ******************************************************************************/
-
-/* Define for static allocation */
-#define ZSTD_BLOCKSIZE_ABSOLUTEMAX (128 * 1024)
-/*=====   Raw zstd block functions  =====*/
-size_t ZSTD_getBlockSizeMax(ZSTD_CCtx *cctx);
-size_t ZSTD_compressBlock(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity,
-	const void *src, size_t srcSize);
-size_t ZSTD_decompressBlock(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity,
-	const void *src, size_t srcSize);
-size_t ZSTD_insertBlock(ZSTD_DCtx *dctx, const void *blockStart,
-	size_t blockSize);
+size_t zstd_get_frame_header(zstd_frame_header *params, const void *src,
+	size_t src_size);
 
-#endif  /* ZSTD_H */
+#endif  /* LINUX_ZSTD_H */
diff --git a/include/linux/zstd_lib.h b/include/linux/zstd_lib.h
new file mode 100644
index 000000000000..13151c34f725
--- /dev/null
+++ b/include/linux/zstd_lib.h
@@ -0,0 +1,1157 @@
+/*
+ * Copyright (c) Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of https://github.com/facebook/zstd.
+ * An additional grant of patent rights can be found in the PATENTS file in the
+ * same directory.
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 2 as published by the
+ * Free Software Foundation. This program is dual-licensed; you may select
+ * either version 2 of the GNU General Public License ("GPL") or BSD license
+ * ("BSD").
+ */
+
+#ifndef ZSTD_H
+#define ZSTD_H
+
+/* ======   Dependency   ======*/
+#include <linux/types.h>   /* size_t */
+
+
+/*-*****************************************************************************
+ * Introduction
+ *
+ * zstd, short for Zstandard, is a fast lossless compression algorithm,
+ * targeting real-time compression scenarios at zlib-level and better
+ * compression ratios. The zstd compression library provides in-memory
+ * compression and decompression functions. The library supports compression
+ * levels from 1 up to ZSTD_maxCLevel() which is 22. Levels >= 20, labeled
+ * ultra, should be used with caution, as they require more memory.
+ * Compression can be done in:
+ *  - a single step, reusing a context (described as Explicit memory management)
+ *  - unbounded multiple steps (described as Streaming compression)
+ * The compression ratio achievable on small data can be highly improved using
+ * compression with a dictionary in:
+ *  - a single step (described as Simple dictionary API)
+ *  - a single step, reusing a dictionary (described as Fast dictionary API)
+ ******************************************************************************/
+
+/*======  Helper functions  ======*/
+
+/**
+ * enum ZSTD_ErrorCode - zstd error codes
+ *
+ * Functions that return size_t can be checked for errors using ZSTD_isError()
+ * and the ZSTD_ErrorCode can be extracted using ZSTD_getErrorCode().
+ */
+typedef enum {
+	ZSTD_error_no_error,
+	ZSTD_error_GENERIC,
+	ZSTD_error_prefix_unknown,
+	ZSTD_error_version_unsupported,
+	ZSTD_error_parameter_unknown,
+	ZSTD_error_frameParameter_unsupported,
+	ZSTD_error_frameParameter_unsupportedBy32bits,
+	ZSTD_error_frameParameter_windowTooLarge,
+	ZSTD_error_compressionParameter_unsupported,
+	ZSTD_error_init_missing,
+	ZSTD_error_memory_allocation,
+	ZSTD_error_stage_wrong,
+	ZSTD_error_dstSize_tooSmall,
+	ZSTD_error_srcSize_wrong,
+	ZSTD_error_corruption_detected,
+	ZSTD_error_checksum_wrong,
+	ZSTD_error_tableLog_tooLarge,
+	ZSTD_error_maxSymbolValue_tooLarge,
+	ZSTD_error_maxSymbolValue_tooSmall,
+	ZSTD_error_dictionary_corrupted,
+	ZSTD_error_dictionary_wrong,
+	ZSTD_error_dictionaryCreation_failed,
+	ZSTD_error_maxCode
+} ZSTD_ErrorCode;
+
+/**
+ * ZSTD_maxCLevel() - maximum compression level available
+ *
+ * Return: Maximum compression level available.
+ */
+int ZSTD_maxCLevel(void);
+/**
+ * ZSTD_compressBound() - maximum compressed size in worst case scenario
+ * @srcSize: The size of the data to compress.
+ *
+ * Return:   The maximum compressed size in the worst case scenario.
+ */
+size_t ZSTD_compressBound(size_t srcSize);
+/**
+ * ZSTD_isError() - tells if a size_t function result is an error code
+ * @code:  The function result to check for error.
+ *
+ * Return: Non-zero iff the code is an error.
+ */
+static __attribute__((unused)) unsigned int ZSTD_isError(size_t code)
+{
+	return code > (size_t)-ZSTD_error_maxCode;
+}
+/**
+ * ZSTD_getErrorCode() - translates an error function result to a ZSTD_ErrorCode
+ * @functionResult: The result of a function for which ZSTD_isError() is true.
+ *
+ * Return:          The ZSTD_ErrorCode corresponding to the functionResult or 0
+ *                  if the functionResult isn't an error.
+ */
+static __attribute__((unused)) ZSTD_ErrorCode ZSTD_getErrorCode(
+	size_t functionResult)
+{
+	if (!ZSTD_isError(functionResult))
+		return (ZSTD_ErrorCode)0;
+	return (ZSTD_ErrorCode)(0 - functionResult);
+}
+
+/**
+ * enum ZSTD_strategy - zstd compression search strategy
+ *
+ * From faster to stronger.
+ */
+typedef enum {
+	ZSTD_fast,
+	ZSTD_dfast,
+	ZSTD_greedy,
+	ZSTD_lazy,
+	ZSTD_lazy2,
+	ZSTD_btlazy2,
+	ZSTD_btopt,
+	ZSTD_btopt2
+} ZSTD_strategy;
+
+/**
+ * struct ZSTD_compressionParameters - zstd compression parameters
+ * @windowLog:    Log of the largest match distance. Larger means more
+ *                compression, and more memory needed during decompression.
+ * @chainLog:     Fully searched segment. Larger means more compression, slower,
+ *                and more memory (useless for fast).
+ * @hashLog:      Dispatch table. Larger means more compression,
+ *                slower, and more memory.
+ * @searchLog:    Number of searches. Larger means more compression and slower.
+ * @searchLength: Match length searched. Larger means faster decompression,
+ *                sometimes less compression.
+ * @targetLength: Acceptable match size for optimal parser (only). Larger means
+ *                more compression, and slower.
+ * @strategy:     The zstd compression strategy.
+ */
+typedef struct {
+	unsigned int windowLog;
+	unsigned int chainLog;
+	unsigned int hashLog;
+	unsigned int searchLog;
+	unsigned int searchLength;
+	unsigned int targetLength;
+	ZSTD_strategy strategy;
+} ZSTD_compressionParameters;
+
+/**
+ * struct ZSTD_frameParameters - zstd frame parameters
+ * @contentSizeFlag: Controls whether content size will be present in the frame
+ *                   header (when known).
+ * @checksumFlag:    Controls whether a 32-bit checksum is generated at the end
+ *                   of the frame for error detection.
+ * @noDictIDFlag:    Controls whether dictID will be saved into the frame header
+ *                   when using dictionary compression.
+ *
+ * The default value is all fields set to 0.
+ */
+typedef struct {
+	unsigned int contentSizeFlag;
+	unsigned int checksumFlag;
+	unsigned int noDictIDFlag;
+} ZSTD_frameParameters;
+
+/**
+ * struct ZSTD_parameters - zstd parameters
+ * @cParams: The compression parameters.
+ * @fParams: The frame parameters.
+ */
+typedef struct {
+	ZSTD_compressionParameters cParams;
+	ZSTD_frameParameters fParams;
+} ZSTD_parameters;
+
+/**
+ * ZSTD_getCParams() - returns ZSTD_compressionParameters for selected level
+ * @compressionLevel: The compression level from 1 to ZSTD_maxCLevel().
+ * @estimatedSrcSize: The estimated source size to compress or 0 if unknown.
+ * @dictSize:         The dictionary size or 0 if a dictionary isn't being used.
+ *
+ * Return:            The selected ZSTD_compressionParameters.
+ */
+ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel,
+	unsigned long long estimatedSrcSize, size_t dictSize);
+
+/**
+ * ZSTD_getParams() - returns ZSTD_parameters for selected level
+ * @compressionLevel: The compression level from 1 to ZSTD_maxCLevel().
+ * @estimatedSrcSize: The estimated source size to compress or 0 if unknown.
+ * @dictSize:         The dictionary size or 0 if a dictionary isn't being used.
+ *
+ * The same as ZSTD_getCParams() except also selects the default frame
+ * parameters (all zero).
+ *
+ * Return:            The selected ZSTD_parameters.
+ */
+ZSTD_parameters ZSTD_getParams(int compressionLevel,
+	unsigned long long estimatedSrcSize, size_t dictSize);
+
+/*-*************************************
+ * Explicit memory management
+ **************************************/
+
+/**
+ * ZSTD_CCtxWorkspaceBound() - amount of memory needed to initialize a ZSTD_CCtx
+ * @cParams: The compression parameters to be used for compression.
+ *
+ * If multiple compression parameters might be used, the caller must call
+ * ZSTD_CCtxWorkspaceBound() for each set of parameters and use the maximum
+ * size.
+ *
+ * Return:   A lower bound on the size of the workspace that is passed to
+ *           ZSTD_initCCtx().
+ */
+size_t ZSTD_CCtxWorkspaceBound(ZSTD_compressionParameters cParams);
+
+/**
+ * struct ZSTD_CCtx - the zstd compression context
+ *
+ * When compressing many times it is recommended to allocate a context just once
+ * and reuse it for each successive compression operation.
+ */
+typedef struct ZSTD_CCtx_s ZSTD_CCtx;
+/**
+ * ZSTD_initCCtx() - initialize a zstd compression context
+ * @workspace:     The workspace to emplace the context into. It must outlive
+ *                 the returned context.
+ * @workspaceSize: The size of workspace. Use ZSTD_CCtxWorkspaceBound() to
+ *                 determine how large the workspace must be.
+ *
+ * Return:         A compression context emplaced into workspace.
+ */
+ZSTD_CCtx *ZSTD_initCCtx(void *workspace, size_t workspaceSize);
+
+/**
+ * ZSTD_compressCCtx() - compress src into dst
+ * @ctx:         The context. Must have been initialized with a workspace at
+ *               least as large as ZSTD_CCtxWorkspaceBound(params.cParams).
+ * @dst:         The buffer to compress src into.
+ * @dstCapacity: The size of the destination buffer. May be any size, but
+ *               ZSTD_compressBound(srcSize) is guaranteed to be large enough.
+ * @src:         The data to compress.
+ * @srcSize:     The size of the data to compress.
+ * @params:      The parameters to use for compression. See ZSTD_getParams().
+ *
+ * Return:       The compressed size or an error, which can be checked using
+ *               ZSTD_isError().
+ */
+size_t ZSTD_compressCCtx(ZSTD_CCtx *ctx, void *dst, size_t dstCapacity,
+	const void *src, size_t srcSize, ZSTD_parameters params);
+
+/**
+ * ZSTD_DCtxWorkspaceBound() - amount of memory needed to initialize a ZSTD_DCtx
+ *
+ * Return: A lower bound on the size of the workspace that is passed to
+ *         ZSTD_initDCtx().
+ */
+size_t ZSTD_DCtxWorkspaceBound(void);
+
+/**
+ * struct ZSTD_DCtx - the zstd decompression context
+ *
+ * When decompressing many times it is recommended to allocate a context just
+ * once and reuse it for each successive decompression operation.
+ */
+typedef struct ZSTD_DCtx_s ZSTD_DCtx;
+/**
+ * ZSTD_initDCtx() - initialize a zstd decompression context
+ * @workspace:     The workspace to emplace the context into. It must outlive
+ *                 the returned context.
+ * @workspaceSize: The size of workspace. Use ZSTD_DCtxWorkspaceBound() to
+ *                 determine how large the workspace must be.
+ *
+ * Return:         A decompression context emplaced into workspace.
+ */
+ZSTD_DCtx *ZSTD_initDCtx(void *workspace, size_t workspaceSize);
+
+/**
+ * ZSTD_decompressDCtx() - decompress zstd compressed src into dst
+ * @ctx:         The decompression context.
+ * @dst:         The buffer to decompress src into.
+ * @dstCapacity: The size of the destination buffer. Must be at least as large
+ *               as the decompressed size. If the caller cannot upper bound the
+ *               decompressed size, then it's better to use the streaming API.
+ * @src:         The zstd compressed data to decompress. Multiple concatenated
+ *               frames and skippable frames are allowed.
+ * @srcSize:     The exact size of the data to decompress.
+ *
+ * Return:       The decompressed size or an error, which can be checked using
+ *               ZSTD_isError().
+ */
+size_t ZSTD_decompressDCtx(ZSTD_DCtx *ctx, void *dst, size_t dstCapacity,
+	const void *src, size_t srcSize);
+
+/*-************************
+ * Simple dictionary API
+ **************************/
+
+/**
+ * ZSTD_compress_usingDict() - compress src into dst using a dictionary
+ * @ctx:         The context. Must have been initialized with a workspace at
+ *               least as large as ZSTD_CCtxWorkspaceBound(params.cParams).
+ * @dst:         The buffer to compress src into.
+ * @dstCapacity: The size of the destination buffer. May be any size, but
+ *               ZSTD_compressBound(srcSize) is guaranteed to be large enough.
+ * @src:         The data to compress.
+ * @srcSize:     The size of the data to compress.
+ * @dict:        The dictionary to use for compression.
+ * @dictSize:    The size of the dictionary.
+ * @params:      The parameters to use for compression. See ZSTD_getParams().
+ *
+ * Compression using a predefined dictionary. The same dictionary must be used
+ * during decompression.
+ *
+ * Return:       The compressed size or an error, which can be checked using
+ *               ZSTD_isError().
+ */
+size_t ZSTD_compress_usingDict(ZSTD_CCtx *ctx, void *dst, size_t dstCapacity,
+	const void *src, size_t srcSize, const void *dict, size_t dictSize,
+	ZSTD_parameters params);
+
+/**
+ * ZSTD_decompress_usingDict() - decompress src into dst using a dictionary
+ * @ctx:         The decompression context.
+ * @dst:         The buffer to decompress src into.
+ * @dstCapacity: The size of the destination buffer. Must be at least as large
+ *               as the decompressed size. If the caller cannot upper bound the
+ *               decompressed size, then it's better to use the streaming API.
+ * @src:         The zstd compressed data to decompress. Multiple concatenated
+ *               frames and skippable frames are allowed.
+ * @srcSize:     The exact size of the data to decompress.
+ * @dict:        The dictionary to use for decompression. The same dictionary
+ *               must've been used to compress the data.
+ * @dictSize:    The size of the dictionary.
+ *
+ * Return:       The decompressed size or an error, which can be checked using
+ *               ZSTD_isError().
+ */
+size_t ZSTD_decompress_usingDict(ZSTD_DCtx *ctx, void *dst, size_t dstCapacity,
+	const void *src, size_t srcSize, const void *dict, size_t dictSize);
+
+/*-**************************
+ * Fast dictionary API
+ ***************************/
+
+/**
+ * ZSTD_CDictWorkspaceBound() - memory needed to initialize a ZSTD_CDict
+ * @cParams: The compression parameters to be used for compression.
+ *
+ * Return:   A lower bound on the size of the workspace that is passed to
+ *           ZSTD_initCDict().
+ */
+size_t ZSTD_CDictWorkspaceBound(ZSTD_compressionParameters cParams);
+
+/**
+ * struct ZSTD_CDict - a digested dictionary to be used for compression
+ */
+typedef struct ZSTD_CDict_s ZSTD_CDict;
+
+/**
+ * ZSTD_initCDict() - initialize a digested dictionary for compression
+ * @dictBuffer:    The dictionary to digest. The buffer is referenced by the
+ *                 ZSTD_CDict so it must outlive the returned ZSTD_CDict.
+ * @dictSize:      The size of the dictionary.
+ * @params:        The parameters to use for compression. See ZSTD_getParams().
+ * @workspace:     The workspace. It must outlive the returned ZSTD_CDict.
+ * @workspaceSize: The workspace size. Must be at least
+ *                 ZSTD_CDictWorkspaceBound(params.cParams).
+ *
+ * When compressing multiple messages / blocks with the same dictionary it is
+ * recommended to load it just once. The ZSTD_CDict merely references the
+ * dictBuffer, so it must outlive the returned ZSTD_CDict.
+ *
+ * Return:         The digested dictionary emplaced into workspace.
+ */
+ZSTD_CDict *ZSTD_initCDict(const void *dictBuffer, size_t dictSize,
+	ZSTD_parameters params, void *workspace, size_t workspaceSize);
+
+/**
+ * ZSTD_compress_usingCDict() - compress src into dst using a ZSTD_CDict
+ * @ctx:         The context. Must have been initialized with a workspace at
+ *               least as large as ZSTD_CCtxWorkspaceBound(cParams) where
+ *               cParams are the compression parameters used to initialize the
+ *               cdict.
+ * @dst:         The buffer to compress src into.
+ * @dstCapacity: The size of the destination buffer. May be any size, but
+ *               ZSTD_compressBound(srcSize) is guaranteed to be large enough.
+ * @src:         The data to compress.
+ * @srcSize:     The size of the data to compress.
+ * @cdict:       The digested dictionary to use for compression.
+ * @params:      The parameters to use for compression. See ZSTD_getParams().
+ *
+ * Compression using a digested dictionary. The same dictionary must be used
+ * during decompression.
+ *
+ * Return:       The compressed size or an error, which can be checked using
+ *               ZSTD_isError().
+ */
+size_t ZSTD_compress_usingCDict(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity,
+	const void *src, size_t srcSize, const ZSTD_CDict *cdict);
+
+
+/**
+ * ZSTD_DDictWorkspaceBound() - memory needed to initialize a ZSTD_DDict
+ *
+ * Return:  A lower bound on the size of the workspace that is passed to
+ *          ZSTD_initDDict().
+ */
+size_t ZSTD_DDictWorkspaceBound(void);
+
+/**
+ * struct ZSTD_DDict - a digested dictionary to be used for decompression
+ */
+typedef struct ZSTD_DDict_s ZSTD_DDict;
+
+/**
+ * ZSTD_initDDict() - initialize a digested dictionary for decompression
+ * @dictBuffer:    The dictionary to digest. The buffer is referenced by the
+ *                 ZSTD_DDict so it must outlive the returned ZSTD_DDict.
+ * @dictSize:      The size of the dictionary.
+ * @workspace:     The workspace. It must outlive the returned ZSTD_DDict.
+ * @workspaceSize: The workspace size. Must be at least
+ *                 ZSTD_DDictWorkspaceBound().
+ *
+ * When decompressing multiple messages / blocks with the same dictionary it is
+ * recommended to load it just once. The ZSTD_DDict merely references the
+ * dictBuffer, so it must outlive the returned ZSTD_DDict.
+ *
+ * Return:         The digested dictionary emplaced into workspace.
+ */
+ZSTD_DDict *ZSTD_initDDict(const void *dictBuffer, size_t dictSize,
+	void *workspace, size_t workspaceSize);
+
+/**
+ * ZSTD_decompress_usingDDict() - decompress src into dst using a ZSTD_DDict
+ * @ctx:         The decompression context.
+ * @dst:         The buffer to decompress src into.
+ * @dstCapacity: The size of the destination buffer. Must be at least as large
+ *               as the decompressed size. If the caller cannot upper bound the
+ *               decompressed size, then it's better to use the streaming API.
+ * @src:         The zstd compressed data to decompress. Multiple concatenated
+ *               frames and skippable frames are allowed.
+ * @srcSize:     The exact size of the data to decompress.
+ * @ddict:       The digested dictionary to use for decompression. The same
+ *               dictionary must've been used to compress the data.
+ *
+ * Return:       The decompressed size or an error, which can be checked using
+ *               ZSTD_isError().
+ */
+size_t ZSTD_decompress_usingDDict(ZSTD_DCtx *dctx, void *dst,
+	size_t dstCapacity, const void *src, size_t srcSize,
+	const ZSTD_DDict *ddict);
+
+
+/*-**************************
+ * Streaming
+ ***************************/
+
+/**
+ * struct ZSTD_inBuffer - input buffer for streaming
+ * @src:  Start of the input buffer.
+ * @size: Size of the input buffer.
+ * @pos:  Position where reading stopped. Will be updated.
+ *        Necessarily 0 <= pos <= size.
+ */
+typedef struct ZSTD_inBuffer_s {
+	const void *src;
+	size_t size;
+	size_t pos;
+} ZSTD_inBuffer;
+
+/**
+ * struct ZSTD_outBuffer - output buffer for streaming
+ * @dst:  Start of the output buffer.
+ * @size: Size of the output buffer.
+ * @pos:  Position where writing stopped. Will be updated.
+ *        Necessarily 0 <= pos <= size.
+ */
+typedef struct ZSTD_outBuffer_s {
+	void *dst;
+	size_t size;
+	size_t pos;
+} ZSTD_outBuffer;
+
+
+
+/*-*****************************************************************************
+ * Streaming compression - HowTo
+ *
+ * A ZSTD_CStream object is required to track streaming operation.
+ * Use ZSTD_initCStream() to initialize a ZSTD_CStream object.
+ * ZSTD_CStream objects can be reused multiple times on consecutive compression
+ * operations. It is recommended to re-use ZSTD_CStream in situations where many
+ * streaming operations will be achieved consecutively. Use one separate
+ * ZSTD_CStream per thread for parallel execution.
+ *
+ * Use ZSTD_compressStream() repetitively to consume input stream.
+ * The function will automatically update both `pos` fields.
+ * Note that it may not consume the entire input, in which case `pos < size`,
+ * and it's up to the caller to present again remaining data.
+ * It returns a hint for the preferred number of bytes to use as an input for
+ * the next function call.
+ *
+ * At any moment, it's possible to flush whatever data remains within internal
+ * buffer, using ZSTD_flushStream(). `output->pos` will be updated. There might
+ * still be some content left within the internal buffer if `output->size` is
+ * too small. It returns the number of bytes left in the internal buffer and
+ * must be called until it returns 0.
+ *
+ * ZSTD_endStream() instructs to finish a frame. It will perform a flush and
+ * write frame epilogue. The epilogue is required for decoders to consider a
+ * frame completed. Similar to ZSTD_flushStream(), it may not be able to flush
+ * the full content if `output->size` is too small. In which case, call again
+ * ZSTD_endStream() to complete the flush. It returns the number of bytes left
+ * in the internal buffer and must be called until it returns 0.
+ ******************************************************************************/
+
+/**
+ * ZSTD_CStreamWorkspaceBound() - memory needed to initialize a ZSTD_CStream
+ * @cParams: The compression parameters to be used for compression.
+ *
+ * Return:   A lower bound on the size of the workspace that is passed to
+ *           ZSTD_initCStream() and ZSTD_initCStream_usingCDict().
+ */
+size_t ZSTD_CStreamWorkspaceBound(ZSTD_compressionParameters cParams);
+
+/**
+ * struct ZSTD_CStream - the zstd streaming compression context
+ */
+typedef struct ZSTD_CStream_s ZSTD_CStream;
+
+/*===== ZSTD_CStream management functions =====*/
+/**
+ * ZSTD_initCStream() - initialize a zstd streaming compression context
+ * @params:         The zstd compression parameters.
+ * @pledgedSrcSize: If params.fParams.contentSizeFlag == 1 then the caller must
+ *                  pass the source size (zero means empty source). Otherwise,
+ *                  the caller may optionally pass the source size, or zero if
+ *                  unknown.
+ * @workspace:      The workspace to emplace the context into. It must outlive
+ *                  the returned context.
+ * @workspaceSize:  The size of workspace.
+ *                  Use ZSTD_CStreamWorkspaceBound(params.cParams) to determine
+ *                  how large the workspace must be.
+ *
+ * Return:          The zstd streaming compression context.
+ */
+ZSTD_CStream *ZSTD_initCStream(ZSTD_parameters params,
+	unsigned long long pledgedSrcSize, void *workspace,
+	size_t workspaceSize);
+
+/**
+ * ZSTD_initCStream_usingCDict() - initialize a streaming compression context
+ * @cdict:          The digested dictionary to use for compression.
+ * @pledgedSrcSize: Optionally the source size, or zero if unknown.
+ * @workspace:      The workspace to emplace the context into. It must outlive
+ *                  the returned context.
+ * @workspaceSize:  The size of workspace. Call ZSTD_CStreamWorkspaceBound()
+ *                  with the cParams used to initialize the cdict to determine
+ *                  how large the workspace must be.
+ *
+ * Return:          The zstd streaming compression context.
+ */
+ZSTD_CStream *ZSTD_initCStream_usingCDict(const ZSTD_CDict *cdict,
+	unsigned long long pledgedSrcSize, void *workspace,
+	size_t workspaceSize);
+
+/*===== Streaming compression functions =====*/
+/**
+ * ZSTD_resetCStream() - reset the context using parameters from creation
+ * @zcs:            The zstd streaming compression context to reset.
+ * @pledgedSrcSize: Optionally the source size, or zero if unknown.
+ *
+ * Resets the context using the parameters from creation. Skips dictionary
+ * loading, since it can be reused. If `pledgedSrcSize` is non-zero the frame
+ * content size is always written into the frame header.
+ *
+ * Return:          Zero or an error, which can be checked using ZSTD_isError().
+ */
+size_t ZSTD_resetCStream(ZSTD_CStream *zcs, unsigned long long pledgedSrcSize);
+/**
+ * ZSTD_compressStream() - streaming compress some of input into output
+ * @zcs:    The zstd streaming compression context.
+ * @output: Destination buffer. `output->pos` is updated to indicate how much
+ *          compressed data was written.
+ * @input:  Source buffer. `input->pos` is updated to indicate how much data was
+ *          read. Note that it may not consume the entire input, in which case
+ *          `input->pos < input->size`, and it's up to the caller to present
+ *          remaining data again.
+ *
+ * The `input` and `output` buffers may be any size. Guaranteed to make some
+ * forward progress if `input` and `output` are not empty.
+ *
+ * Return:  A hint for the number of bytes to use as the input for the next
+ *          function call or an error, which can be checked using
+ *          ZSTD_isError().
+ */
+size_t ZSTD_compressStream(ZSTD_CStream *zcs, ZSTD_outBuffer *output,
+	ZSTD_inBuffer *input);
+/**
+ * ZSTD_flushStream() - flush internal buffers into output
+ * @zcs:    The zstd streaming compression context.
+ * @output: Destination buffer. `output->pos` is updated to indicate how much
+ *          compressed data was written.
+ *
+ * ZSTD_flushStream() must be called until it returns 0, meaning all the data
+ * has been flushed. Since ZSTD_flushStream() causes a block to be ended,
+ * calling it too often will degrade the compression ratio.
+ *
+ * Return:  The number of bytes still present within internal buffers or an
+ *          error, which can be checked using ZSTD_isError().
+ */
+size_t ZSTD_flushStream(ZSTD_CStream *zcs, ZSTD_outBuffer *output);
+/**
+ * ZSTD_endStream() - flush internal buffers into output and end the frame
+ * @zcs:    The zstd streaming compression context.
+ * @output: Destination buffer. `output->pos` is updated to indicate how much
+ *          compressed data was written.
+ *
+ * ZSTD_endStream() must be called until it returns 0, meaning all the data has
+ * been flushed and the frame epilogue has been written.
+ *
+ * Return:  The number of bytes still present within internal buffers or an
+ *          error, which can be checked using ZSTD_isError().
+ */
+size_t ZSTD_endStream(ZSTD_CStream *zcs, ZSTD_outBuffer *output);
+
+/**
+ * ZSTD_CStreamInSize() - recommended size for the input buffer
+ *
+ * Return: The recommended size for the input buffer.
+ */
+size_t ZSTD_CStreamInSize(void);
+/**
+ * ZSTD_CStreamOutSize() - recommended size for the output buffer
+ *
+ * When the output buffer is at least this large, it is guaranteed to be large
+ * enough to flush at least one complete compressed block.
+ *
+ * Return: The recommended size for the output buffer.
+ */
+size_t ZSTD_CStreamOutSize(void);
+
+
+
+/*-*****************************************************************************
+ * Streaming decompression - HowTo
+ *
+ * A ZSTD_DStream object is required to track streaming operations.
+ * Use ZSTD_initDStream() to initialize a ZSTD_DStream object.
+ * ZSTD_DStream objects can be re-used multiple times.
+ *
+ * Use ZSTD_decompressStream() repetitively to consume your input.
+ * The function will update both `pos` fields.
+ * If `input->pos < input->size`, some input has not been consumed.
+ * It's up to the caller to present again remaining data.
+ * If `output->pos < output->size`, decoder has flushed everything it could.
+ * Returns 0 iff a frame is completely decoded and fully flushed.
+ * Otherwise it returns a suggested next input size that will never load more
+ * than the current frame.
+ ******************************************************************************/
+
+/**
+ * ZSTD_DStreamWorkspaceBound() - memory needed to initialize a ZSTD_DStream
+ * @maxWindowSize: The maximum window size allowed for compressed frames.
+ *
+ * Return:         A lower bound on the size of the workspace that is passed to
+ *                 ZSTD_initDStream() and ZSTD_initDStream_usingDDict().
+ */
+size_t ZSTD_DStreamWorkspaceBound(size_t maxWindowSize);
+
+/**
+ * struct ZSTD_DStream - the zstd streaming decompression context
+ */
+typedef struct ZSTD_DStream_s ZSTD_DStream;
+/*===== ZSTD_DStream management functions =====*/
+/**
+ * ZSTD_initDStream() - initialize a zstd streaming decompression context
+ * @maxWindowSize: The maximum window size allowed for compressed frames.
+ * @workspace:     The workspace to emplace the context into. It must outlive
+ *                 the returned context.
+ * @workspaceSize: The size of workspace.
+ *                 Use ZSTD_DStreamWorkspaceBound(maxWindowSize) to determine
+ *                 how large the workspace must be.
+ *
+ * Return:         The zstd streaming decompression context.
+ */
+ZSTD_DStream *ZSTD_initDStream(size_t maxWindowSize, void *workspace,
+	size_t workspaceSize);
+/**
+ * ZSTD_initDStream_usingDDict() - initialize streaming decompression context
+ * @maxWindowSize: The maximum window size allowed for compressed frames.
+ * @ddict:         The digested dictionary to use for decompression.
+ * @workspace:     The workspace to emplace the context into. It must outlive
+ *                 the returned context.
+ * @workspaceSize: The size of workspace.
+ *                 Use ZSTD_DStreamWorkspaceBound(maxWindowSize) to determine
+ *                 how large the workspace must be.
+ *
+ * Return:         The zstd streaming decompression context.
+ */
+ZSTD_DStream *ZSTD_initDStream_usingDDict(size_t maxWindowSize,
+	const ZSTD_DDict *ddict, void *workspace, size_t workspaceSize);
+
+/*===== Streaming decompression functions =====*/
+/**
+ * ZSTD_resetDStream() - reset the context using parameters from creation
+ * @zds:   The zstd streaming decompression context to reset.
+ *
+ * Resets the context using the parameters from creation. Skips dictionary
+ * loading, since it can be reused.
+ *
+ * Return: Zero or an error, which can be checked using ZSTD_isError().
+ */
+size_t ZSTD_resetDStream(ZSTD_DStream *zds);
+/**
+ * ZSTD_decompressStream() - streaming decompress some of input into output
+ * @zds:    The zstd streaming decompression context.
+ * @output: Destination buffer. `output.pos` is updated to indicate how much
+ *          decompressed data was written.
+ * @input:  Source buffer. `input.pos` is updated to indicate how much data was
+ *          read. Note that it may not consume the entire input, in which case
+ *          `input.pos < input.size`, and it's up to the caller to present
+ *          remaining data again.
+ *
+ * The `input` and `output` buffers may be any size. Guaranteed to make some
+ * forward progress if `input` and `output` are not empty.
+ * ZSTD_decompressStream() will not consume the last byte of the frame until
+ * the entire frame is flushed.
+ *
+ * Return:  Returns 0 iff a frame is completely decoded and fully flushed.
+ *          Otherwise returns a hint for the number of bytes to use as the input
+ *          for the next function call or an error, which can be checked using
+ *          ZSTD_isError(). The size hint will never load more than the frame.
+ */
+size_t ZSTD_decompressStream(ZSTD_DStream *zds, ZSTD_outBuffer *output,
+	ZSTD_inBuffer *input);
+
+/**
+ * ZSTD_DStreamInSize() - recommended size for the input buffer
+ *
+ * Return: The recommended size for the input buffer.
+ */
+size_t ZSTD_DStreamInSize(void);
+/**
+ * ZSTD_DStreamOutSize() - recommended size for the output buffer
+ *
+ * When the output buffer is at least this large, it is guaranteed to be large
+ * enough to flush at least one complete decompressed block.
+ *
+ * Return: The recommended size for the output buffer.
+ */
+size_t ZSTD_DStreamOutSize(void);
+
+
+/* --- Constants ---*/
+#define ZSTD_MAGICNUMBER            0xFD2FB528   /* >= v0.8.0 */
+#define ZSTD_MAGIC_SKIPPABLE_START  0x184D2A50U
+
+#define ZSTD_CONTENTSIZE_UNKNOWN (0ULL - 1)
+#define ZSTD_CONTENTSIZE_ERROR   (0ULL - 2)
+
+#define ZSTD_WINDOWLOG_MAX_32  27
+#define ZSTD_WINDOWLOG_MAX_64  27
+#define ZSTD_WINDOWLOG_MAX \
+	((unsigned int)(sizeof(size_t) == 4 \
+		? ZSTD_WINDOWLOG_MAX_32 \
+		: ZSTD_WINDOWLOG_MAX_64))
+#define ZSTD_WINDOWLOG_MIN 10
+#define ZSTD_HASHLOG_MAX ZSTD_WINDOWLOG_MAX
+#define ZSTD_HASHLOG_MIN        6
+#define ZSTD_CHAINLOG_MAX     (ZSTD_WINDOWLOG_MAX+1)
+#define ZSTD_CHAINLOG_MIN      ZSTD_HASHLOG_MIN
+#define ZSTD_HASHLOG3_MAX      17
+#define ZSTD_SEARCHLOG_MAX    (ZSTD_WINDOWLOG_MAX-1)
+#define ZSTD_SEARCHLOG_MIN      1
+/* only for ZSTD_fast, other strategies are limited to 6 */
+#define ZSTD_SEARCHLENGTH_MAX   7
+/* only for ZSTD_btopt, other strategies are limited to 4 */
+#define ZSTD_SEARCHLENGTH_MIN   3
+#define ZSTD_TARGETLENGTH_MIN   4
+#define ZSTD_TARGETLENGTH_MAX 999
+
+/* for static allocation */
+#define ZSTD_FRAMEHEADERSIZE_MAX 18
+#define ZSTD_FRAMEHEADERSIZE_MIN  6
+#define ZSTD_frameHeaderSize_prefix 5
+#define ZSTD_frameHeaderSize_min ZSTD_FRAMEHEADERSIZE_MIN
+#define ZSTD_frameHeaderSize_max ZSTD_FRAMEHEADERSIZE_MAX
+/* magic number + skippable frame length */
+#define ZSTD_skippableHeaderSize 8
+
+
+/*-*************************************
+ * Compressed size functions
+ **************************************/
+
+/**
+ * ZSTD_findFrameCompressedSize() - returns the size of a compressed frame
+ * @src:     Source buffer. It should point to the start of a zstd encoded frame
+ *           or a skippable frame.
+ * @srcSize: The size of the source buffer. It must be at least as large as the
+ *           size of the frame.
+ *
+ * Return:   The compressed size of the frame pointed to by `src` or an error,
+ *           which can be check with ZSTD_isError().
+ *           Suitable to pass to ZSTD_decompress() or similar functions.
+ */
+size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize);
+
+/*-*************************************
+ * Decompressed size functions
+ **************************************/
+/**
+ * ZSTD_getFrameContentSize() - returns the content size in a zstd frame header
+ * @src:     It should point to the start of a zstd encoded frame.
+ * @srcSize: The size of the source buffer. It must be at least as large as the
+ *           frame header. `ZSTD_frameHeaderSize_max` is always large enough.
+ *
+ * Return:   The frame content size stored in the frame header if known.
+ *           `ZSTD_CONTENTSIZE_UNKNOWN` if the content size isn't stored in the
+ *           frame header. `ZSTD_CONTENTSIZE_ERROR` on invalid input.
+ */
+unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize);
+
+/**
+ * ZSTD_findDecompressedSize() - returns decompressed size of a series of frames
+ * @src:     It should point to the start of a series of zstd encoded and/or
+ *           skippable frames.
+ * @srcSize: The exact size of the series of frames.
+ *
+ * If any zstd encoded frame in the series doesn't have the frame content size
+ * set, `ZSTD_CONTENTSIZE_UNKNOWN` is returned. But frame content size is always
+ * set when using ZSTD_compress(). The decompressed size can be very large.
+ * If the source is untrusted, the decompressed size could be wrong or
+ * intentionally modified. Always ensure the result fits within the
+ * application's authorized limits. ZSTD_findDecompressedSize() handles multiple
+ * frames, and so it must traverse the input to read each frame header. This is
+ * efficient as most of the data is skipped, however it does mean that all frame
+ * data must be present and valid.
+ *
+ * Return:   Decompressed size of all the data contained in the frames if known.
+ *           `ZSTD_CONTENTSIZE_UNKNOWN` if the decompressed size is unknown.
+ *           `ZSTD_CONTENTSIZE_ERROR` if an error occurred.
+ */
+unsigned long long ZSTD_findDecompressedSize(const void *src, size_t srcSize);
+
+/*-*************************************
+ * Advanced compression functions
+ **************************************/
+/**
+ * ZSTD_checkCParams() - ensure parameter values remain within authorized range
+ * @cParams: The zstd compression parameters.
+ *
+ * Return:   Zero or an error, which can be checked using ZSTD_isError().
+ */
+size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams);
+
+/**
+ * ZSTD_adjustCParams() - optimize parameters for a given srcSize and dictSize
+ * @srcSize:  Optionally the estimated source size, or zero if unknown.
+ * @dictSize: Optionally the estimated dictionary size, or zero if unknown.
+ *
+ * Return:    The optimized parameters.
+ */
+ZSTD_compressionParameters ZSTD_adjustCParams(
+	ZSTD_compressionParameters cParams, unsigned long long srcSize,
+	size_t dictSize);
+
+/*--- Advanced decompression functions ---*/
+
+/**
+ * ZSTD_isFrame() - returns true iff the buffer starts with a valid frame
+ * @buffer: The source buffer to check.
+ * @size:   The size of the source buffer, must be at least 4 bytes.
+ *
+ * Return: True iff the buffer starts with a zstd or skippable frame identifier.
+ */
+unsigned int ZSTD_isFrame(const void *buffer, size_t size);
+
+/**
+ * ZSTD_getDictID_fromDict() - returns the dictionary id stored in a dictionary
+ * @dict:     The dictionary buffer.
+ * @dictSize: The size of the dictionary buffer.
+ *
+ * Return:    The dictionary id stored within the dictionary or 0 if the
+ *            dictionary is not a zstd dictionary. If it returns 0 the
+ *            dictionary can still be loaded as a content-only dictionary.
+ */
+unsigned int ZSTD_getDictID_fromDict(const void *dict, size_t dictSize);
+
+/**
+ * ZSTD_getDictID_fromDDict() - returns the dictionary id stored in a ZSTD_DDict
+ * @ddict: The ddict to find the id of.
+ *
+ * Return: The dictionary id stored within `ddict` or 0 if the dictionary is not
+ *         a zstd dictionary. If it returns 0 `ddict` will be loaded as a
+ *         content-only dictionary.
+ */
+unsigned int ZSTD_getDictID_fromDDict(const ZSTD_DDict *ddict);
+
+/**
+ * ZSTD_getDictID_fromFrame() - returns the dictionary id stored in a zstd frame
+ * @src:     Source buffer. It must be a zstd encoded frame.
+ * @srcSize: The size of the source buffer. It must be at least as large as the
+ *           frame header. `ZSTD_frameHeaderSize_max` is always large enough.
+ *
+ * Return:   The dictionary id required to decompress the frame stored within
+ *           `src` or 0 if the dictionary id could not be decoded. It can return
+ *           0 if the frame does not require a dictionary, the dictionary id
+ *           wasn't stored in the frame, `src` is not a zstd frame, or `srcSize`
+ *           is too small.
+ */
+unsigned int ZSTD_getDictID_fromFrame(const void *src, size_t srcSize);
+
+/**
+ * struct ZSTD_frameParams - zstd frame parameters stored in the frame header
+ * @frameContentSize: The frame content size, or 0 if not present.
+ * @windowSize:       The window size, or 0 if the frame is a skippable frame.
+ * @dictID:           The dictionary id, or 0 if not present.
+ * @checksumFlag:     Whether a checksum was used.
+ */
+typedef struct {
+	unsigned long long frameContentSize;
+	unsigned int windowSize;
+	unsigned int dictID;
+	unsigned int checksumFlag;
+} ZSTD_frameParams;
+
+/**
+ * ZSTD_getFrameParams() - extracts parameters from a zstd or skippable frame
+ * @fparamsPtr: On success the frame parameters are written here.
+ * @src:        The source buffer. It must point to a zstd or skippable frame.
+ * @srcSize:    The size of the source buffer. `ZSTD_frameHeaderSize_max` is
+ *              always large enough to succeed.
+ *
+ * Return:      0 on success. If more data is required it returns how many bytes
+ *              must be provided to make forward progress. Otherwise it returns
+ *              an error, which can be checked using ZSTD_isError().
+ */
+size_t ZSTD_getFrameParams(ZSTD_frameParams *fparamsPtr, const void *src,
+	size_t srcSize);
+
+/*-*****************************************************************************
+ * Buffer-less and synchronous inner streaming functions
+ *
+ * This is an advanced API, giving full control over buffer management, for
+ * users which need direct control over memory.
+ * But it's also a complex one, with many restrictions (documented below).
+ * Prefer using normal streaming API for an easier experience
+ ******************************************************************************/
+
+/*-*****************************************************************************
+ * Buffer-less streaming compression (synchronous mode)
+ *
+ * A ZSTD_CCtx object is required to track streaming operations.
+ * Use ZSTD_initCCtx() to initialize a context.
+ * ZSTD_CCtx object can be re-used multiple times within successive compression
+ * operations.
+ *
+ * Start by initializing a context.
+ * Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary
+ * compression,
+ * or ZSTD_compressBegin_advanced(), for finer parameter control.
+ * It's also possible to duplicate a reference context which has already been
+ * initialized, using ZSTD_copyCCtx()
+ *
+ * Then, consume your input using ZSTD_compressContinue().
+ * There are some important considerations to keep in mind when using this
+ * advanced function :
+ * - ZSTD_compressContinue() has no internal buffer. It uses externally provided
+ *   buffer only.
+ * - Interface is synchronous : input is consumed entirely and produce 1+
+ *   (or more) compressed blocks.
+ * - Caller must ensure there is enough space in `dst` to store compressed data
+ *   under worst case scenario. Worst case evaluation is provided by
+ *   ZSTD_compressBound().
+ *   ZSTD_compressContinue() doesn't guarantee recover after a failed
+ *   compression.
+ * - ZSTD_compressContinue() presumes prior input ***is still accessible and
+ *   unmodified*** (up to maximum distance size, see WindowLog).
+ *   It remembers all previous contiguous blocks, plus one separated memory
+ *   segment (which can itself consists of multiple contiguous blocks)
+ * - ZSTD_compressContinue() detects that prior input has been overwritten when
+ *   `src` buffer overlaps. In which case, it will "discard" the relevant memory
+ *   section from its history.
+ *
+ * Finish a frame with ZSTD_compressEnd(), which will write the last block(s)
+ * and optional checksum. It's possible to use srcSize==0, in which case, it
+ * will write a final empty block to end the frame. Without last block mark,
+ * frames will be considered unfinished (corrupted) by decoders.
+ *
+ * `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress some new
+ * frame.
+ ******************************************************************************/
+
+/*=====   Buffer-less streaming compression functions  =====*/
+size_t ZSTD_compressBegin(ZSTD_CCtx *cctx, int compressionLevel);
+size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx *cctx, const void *dict,
+	size_t dictSize, int compressionLevel);
+size_t ZSTD_compressBegin_advanced(ZSTD_CCtx *cctx, const void *dict,
+	size_t dictSize, ZSTD_parameters params,
+	unsigned long long pledgedSrcSize);
+size_t ZSTD_copyCCtx(ZSTD_CCtx *cctx, const ZSTD_CCtx *preparedCCtx,
+	unsigned long long pledgedSrcSize);
+size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx *cctx, const ZSTD_CDict *cdict,
+	unsigned long long pledgedSrcSize);
+size_t ZSTD_compressContinue(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity,
+	const void *src, size_t srcSize);
+size_t ZSTD_compressEnd(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity,
+	const void *src, size_t srcSize);
+
+
+
+/*-*****************************************************************************
+ * Buffer-less streaming decompression (synchronous mode)
+ *
+ * A ZSTD_DCtx object is required to track streaming operations.
+ * Use ZSTD_initDCtx() to initialize a context.
+ * A ZSTD_DCtx object can be re-used multiple times.
+ *
+ * First typical operation is to retrieve frame parameters, using
+ * ZSTD_getFrameParams(). It fills a ZSTD_frameParams structure which provide
+ * important information to correctly decode the frame, such as the minimum
+ * rolling buffer size to allocate to decompress data (`windowSize`), and the
+ * dictionary ID used.
+ * Note: content size is optional, it may not be present. 0 means unknown.
+ * Note that these values could be wrong, either because of data malformation,
+ * or because an attacker is spoofing deliberate false information. As a
+ * consequence, check that values remain within valid application range,
+ * especially `windowSize`, before allocation. Each application can set its own
+ * limit, depending on local restrictions. For extended interoperability, it is
+ * recommended to support at least 8 MB.
+ * Frame parameters are extracted from the beginning of the compressed frame.
+ * Data fragment must be large enough to ensure successful decoding, typically
+ * `ZSTD_frameHeaderSize_max` bytes.
+ * Result: 0: successful decoding, the `ZSTD_frameParams` structure is filled.
+ *        >0: `srcSize` is too small, provide at least this many bytes.
+ *        errorCode, which can be tested using ZSTD_isError().
+ *
+ * Start decompression, with ZSTD_decompressBegin() or
+ * ZSTD_decompressBegin_usingDict(). Alternatively, you can copy a prepared
+ * context, using ZSTD_copyDCtx().
+ *
+ * Then use ZSTD_nextSrcSizeToDecompress() and ZSTD_decompressContinue()
+ * alternatively.
+ * ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize'
+ * to ZSTD_decompressContinue().
+ * ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will
+ * fail.
+ *
+ * The result of ZSTD_decompressContinue() is the number of bytes regenerated
+ * within 'dst' (necessarily <= dstCapacity). It can be zero, which is not an
+ * error; it just means ZSTD_decompressContinue() has decoded some metadata
+ * item. It can also be an error code, which can be tested with ZSTD_isError().
+ *
+ * ZSTD_decompressContinue() needs previous data blocks during decompression, up
+ * to `windowSize`. They should preferably be located contiguously, prior to
+ * current block. Alternatively, a round buffer of sufficient size is also
+ * possible. Sufficient size is determined by frame parameters.
+ * ZSTD_decompressContinue() is very sensitive to contiguity, if 2 blocks don't
+ * follow each other, make sure that either the compressor breaks contiguity at
+ * the same place, or that previous contiguous segment is large enough to
+ * properly handle maximum back-reference.
+ *
+ * A frame is fully decoded when ZSTD_nextSrcSizeToDecompress() returns zero.
+ * Context can then be reset to start a new decompression.
+ *
+ * Note: it's possible to know if next input to present is a header or a block,
+ * using ZSTD_nextInputType(). This information is not required to properly
+ * decode a frame.
+ *
+ * == Special case: skippable frames ==
+ *
+ * Skippable frames allow integration of user-defined data into a flow of
+ * concatenated frames. Skippable frames will be ignored (skipped) by a
+ * decompressor. The format of skippable frames is as follows:
+ * a) Skippable frame ID - 4 Bytes, Little endian format, any value from
+ *    0x184D2A50 to 0x184D2A5F
+ * b) Frame Size - 4 Bytes, Little endian format, unsigned 32-bits
+ * c) Frame Content - any content (User Data) of length equal to Frame Size
+ * For skippable frames ZSTD_decompressContinue() always returns 0.
+ * For skippable frames ZSTD_getFrameParams() returns fparamsPtr->windowLog==0
+ * what means that a frame is skippable.
+ * Note: If fparamsPtr->frameContentSize==0, it is ambiguous: the frame might
+ *       actually be a zstd encoded frame with no content. For purposes of
+ *       decompression, it is valid in both cases to skip the frame using
+ *       ZSTD_findFrameCompressedSize() to find its size in bytes.
+ * It also returns frame size as fparamsPtr->frameContentSize.
+ ******************************************************************************/
+
+/*=====   Buffer-less streaming decompression functions  =====*/
+size_t ZSTD_decompressBegin(ZSTD_DCtx *dctx);
+size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx *dctx, const void *dict,
+	size_t dictSize);
+void   ZSTD_copyDCtx(ZSTD_DCtx *dctx, const ZSTD_DCtx *preparedDCtx);
+size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx *dctx);
+size_t ZSTD_decompressContinue(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity,
+	const void *src, size_t srcSize);
+typedef enum {
+	ZSTDnit_frameHeader,
+	ZSTDnit_blockHeader,
+	ZSTDnit_block,
+	ZSTDnit_lastBlock,
+	ZSTDnit_checksum,
+	ZSTDnit_skippableFrame
+} ZSTD_nextInputType_e;
+ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx *dctx);
+
+/*-*****************************************************************************
+ * Block functions
+ *
+ * Block functions produce and decode raw zstd blocks, without frame metadata.
+ * Frame metadata cost is typically ~18 bytes, which can be non-negligible for
+ * very small blocks (< 100 bytes). User will have to take in charge required
+ * information to regenerate data, such as compressed and content sizes.
+ *
+ * A few rules to respect:
+ * - Compressing and decompressing require a context structure
+ *   + Use ZSTD_initCCtx() and ZSTD_initDCtx()
+ * - It is necessary to init context before starting
+ *   + compression : ZSTD_compressBegin()
+ *   + decompression : ZSTD_decompressBegin()
+ *   + variants _usingDict() are also allowed
+ *   + copyCCtx() and copyDCtx() work too
+ * - Block size is limited, it must be <= ZSTD_getBlockSizeMax()
+ *   + If you need to compress more, cut data into multiple blocks
+ *   + Consider using the regular ZSTD_compress() instead, as frame metadata
+ *     costs become negligible when source size is large.
+ * - When a block is considered not compressible enough, ZSTD_compressBlock()
+ *   result will be zero. In which case, nothing is produced into `dst`.
+ *   + User must test for such outcome and deal directly with uncompressed data
+ *   + ZSTD_decompressBlock() doesn't accept uncompressed data as input!!!
+ *   + In case of multiple successive blocks, decoder must be informed of
+ *     uncompressed block existence to follow proper history. Use
+ *     ZSTD_insertBlock() in such a case.
+ ******************************************************************************/
+
+/* Define for static allocation */
+#define ZSTD_BLOCKSIZE_ABSOLUTEMAX (128 * 1024)
+/*=====   Raw zstd block functions  =====*/
+size_t ZSTD_getBlockSizeMax(ZSTD_CCtx *cctx);
+size_t ZSTD_compressBlock(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity,
+	const void *src, size_t srcSize);
+size_t ZSTD_decompressBlock(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity,
+	const void *src, size_t srcSize);
+size_t ZSTD_insertBlock(ZSTD_DCtx *dctx, const void *blockStart,
+	size_t blockSize);
+
+#endif  /* ZSTD_H */
-- 
cgit v1.2.3


From e0c1b49f5b674cca7b10549c53b3791d0bbc90a8 Mon Sep 17 00:00:00 2001
From: Nick Terrell <terrelln@fb.com>
Date: Fri, 11 Sep 2020 16:37:08 -0700
Subject: lib: zstd: Upgrade to latest upstream zstd version 1.4.10

Upgrade to the latest upstream zstd version 1.4.10.

This patch is 100% generated from upstream zstd commit 20821a46f412 [0].

This patch is very large because it is transitioning from the custom
kernel zstd to using upstream directly. The new zstd follows upstreams
file structure which is different. Future update patches will be much
smaller because they will only contain the changes from one upstream
zstd release.

As an aid for review I've created a commit [1] that shows the diff
between upstream zstd as-is (which doesn't compile), and the zstd
code imported in this patch. The verion of zstd in this patch is
generated from upstream with changes applied by automation to replace
upstreams libc dependencies, remove unnecessary portability macros,
replace `/**` comments with `/*` comments, and use the kernel's xxhash
instead of bundling it.

The benefits of this patch are as follows:
1. Using upstream directly with automated script to generate kernel
   code. This allows us to update the kernel every upstream release, so
   the kernel gets the latest bug fixes and performance improvements,
   and doesn't get 3 years out of date again. The automation and the
   translated code are tested every upstream commit to ensure it
   continues to work.
2. Upgrades from a custom zstd based on 1.3.1 to 1.4.10, getting 3 years
   of performance improvements and bug fixes. On x86_64 I've measured
   15% faster BtrFS and SquashFS decompression+read speeds, 35% faster
   kernel decompression, and 30% faster ZRAM decompression+read speeds.
3. Zstd-1.4.10 supports negative compression levels, which allow zstd to
   match or subsume lzo's performance.
4. Maintains the same kernel-specific wrapper API, so no callers have to
   be modified with zstd version updates.

One concern that was brought up was stack usage. Upstream zstd had
already removed most of its heavy stack usage functions, but I just
removed the last functions that allocate arrays on the stack. I've
measured the high water mark for both compression and decompression
before and after this patch. Decompression is approximately neutral,
using about 1.2KB of stack space. Compression levels up to 3 regressed
from 1.4KB -> 1.6KB, and higher compression levels regressed from 1.5KB
-> 2KB. We've added unit tests upstream to prevent further regression.
I believe that this is a reasonable increase, and if it does end up
causing problems, this commit can be cleanly reverted, because it only
touches zstd.

I chose the bulk update instead of replaying upstream commits because
there have been ~3500 upstream commits since the 1.3.1 release, zstd
wasn't ready to be used in the kernel as-is before a month ago, and not
all upstream zstd commits build. The bulk update preserves bisectablity
because bugs can be bisected to the zstd version update. At that point
the update can be reverted, and we can work with upstream to find and
fix the bug.

Note that upstream zstd release 1.4.10 doesn't exist yet. I have cut a
staging branch at 20821a46f412 [0] and will apply any changes requested
to the staging branch. Once we're ready to merge this update I will cut
a zstd release at the commit we merge, so we have a known zstd release
in the kernel.

The implementation of the kernel API is contained in
zstd_compress_module.c and zstd_decompress_module.c.

[0] https://github.com/facebook/zstd/commit/20821a46f4122f9abd7c7b245d28162dde8129c9
[1] https://github.com/terrelln/linux/commit/e0fa481d0e3df26918da0a13749740a1f6777574

Signed-off-by: Nick Terrell <terrelln@fb.com>
Tested By: Paul Jones <paul@pauljones.id.au>
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com> # LLVM/Clang v13.0.0 on x86-64
Tested-by: Jean-Denis Girard <jd.girard@sysnux.pf>
---
 include/linux/zstd.h        |   13 +-
 include/linux/zstd_errors.h |   77 +
 include/linux/zstd_lib.h    | 3367 +++++++++++++++++++++++++++++--------------
 3 files changed, 2408 insertions(+), 1049 deletions(-)
 create mode 100644 include/linux/zstd_errors.h

(limited to 'include/linux')

diff --git a/include/linux/zstd.h b/include/linux/zstd.h
index 9fbc7729b0a0..113408eef6ec 100644
--- a/include/linux/zstd.h
+++ b/include/linux/zstd.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
  * Copyright (c) Yann Collet, Facebook, Inc.
  * All rights reserved.
@@ -22,6 +22,7 @@
 
 /* ======   Dependency   ====== */
 #include <linux/types.h>
+#include <linux/zstd_errors.h>
 #include <linux/zstd_lib.h>
 
 /* ======   Helper Functions   ====== */
@@ -417,12 +418,18 @@ size_t zstd_find_frame_compressed_size(const void *src, size_t src_size);
 
 /**
  * struct zstd_frame_params - zstd frame parameters stored in the frame header
- * @frameContentSize: The frame content size, or 0 if not present.
+ * @frameContentSize: The frame content size, or ZSTD_CONTENTSIZE_UNKNOWN if not
+ *                    present.
  * @windowSize:       The window size, or 0 if the frame is a skippable frame.
+ * @blockSizeMax:     The maximum block size.
+ * @frameType:        The frame type (zstd or skippable)
+ * @headerSize:       The size of the frame header.
  * @dictID:           The dictionary id, or 0 if not present.
  * @checksumFlag:     Whether a checksum was used.
+ *
+ * See zstd_lib.h.
  */
-typedef ZSTD_frameParams zstd_frame_header;
+typedef ZSTD_frameHeader zstd_frame_header;
 
 /**
  * zstd_get_frame_header() - extracts parameters from a zstd or skippable frame
diff --git a/include/linux/zstd_errors.h b/include/linux/zstd_errors.h
new file mode 100644
index 000000000000..58b6dd45a969
--- /dev/null
+++ b/include/linux/zstd_errors.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_ERRORS_H_398273423
+#define ZSTD_ERRORS_H_398273423
+
+
+/*===== dependency =====*/
+#include <linux/types.h>   /* size_t */
+
+
+/* =====   ZSTDERRORLIB_API : control library symbols visibility   ===== */
+#define ZSTDERRORLIB_VISIBILITY 
+#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY
+
+/*-*********************************************
+ *  Error codes list
+ *-*********************************************
+ *  Error codes _values_ are pinned down since v1.3.1 only.
+ *  Therefore, don't rely on values if you may link to any version < v1.3.1.
+ *
+ *  Only values < 100 are considered stable.
+ *
+ *  note 1 : this API shall be used with static linking only.
+ *           dynamic linking is not yet officially supported.
+ *  note 2 : Prefer relying on the enum than on its value whenever possible
+ *           This is the only supported way to use the error list < v1.3.1
+ *  note 3 : ZSTD_isError() is always correct, whatever the library version.
+ **********************************************/
+typedef enum {
+  ZSTD_error_no_error = 0,
+  ZSTD_error_GENERIC  = 1,
+  ZSTD_error_prefix_unknown                = 10,
+  ZSTD_error_version_unsupported           = 12,
+  ZSTD_error_frameParameter_unsupported    = 14,
+  ZSTD_error_frameParameter_windowTooLarge = 16,
+  ZSTD_error_corruption_detected = 20,
+  ZSTD_error_checksum_wrong      = 22,
+  ZSTD_error_dictionary_corrupted      = 30,
+  ZSTD_error_dictionary_wrong          = 32,
+  ZSTD_error_dictionaryCreation_failed = 34,
+  ZSTD_error_parameter_unsupported   = 40,
+  ZSTD_error_parameter_outOfBound    = 42,
+  ZSTD_error_tableLog_tooLarge       = 44,
+  ZSTD_error_maxSymbolValue_tooLarge = 46,
+  ZSTD_error_maxSymbolValue_tooSmall = 48,
+  ZSTD_error_stage_wrong       = 60,
+  ZSTD_error_init_missing      = 62,
+  ZSTD_error_memory_allocation = 64,
+  ZSTD_error_workSpace_tooSmall= 66,
+  ZSTD_error_dstSize_tooSmall = 70,
+  ZSTD_error_srcSize_wrong    = 72,
+  ZSTD_error_dstBuffer_null   = 74,
+  /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */
+  ZSTD_error_frameIndex_tooLarge = 100,
+  ZSTD_error_seekableIO          = 102,
+  ZSTD_error_dstBuffer_wrong     = 104,
+  ZSTD_error_srcBuffer_wrong     = 105,
+  ZSTD_error_maxCode = 120  /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */
+} ZSTD_ErrorCode;
+
+/*! ZSTD_getErrorCode() :
+    convert a `size_t` function result into a `ZSTD_ErrorCode` enum type,
+    which can be used to compare with enum list published above */
+ZSTDERRORLIB_API ZSTD_ErrorCode ZSTD_getErrorCode(size_t functionResult);
+ZSTDERRORLIB_API const char* ZSTD_getErrorString(ZSTD_ErrorCode code);   /*< Same as ZSTD_getErrorName, but using a `ZSTD_ErrorCode` enum argument */
+
+
+
+#endif /* ZSTD_ERRORS_H_398273423 */
diff --git a/include/linux/zstd_lib.h b/include/linux/zstd_lib.h
index 13151c34f725..b8c7dbf98390 100644
--- a/include/linux/zstd_lib.h
+++ b/include/linux/zstd_lib.h
@@ -2,1156 +2,2431 @@
  * Copyright (c) Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of https://github.com/facebook/zstd.
- * An additional grant of patent rights can be found in the PATENTS file in the
- * same directory.
- *
- * This program is free software; you can redistribute it and/or modify it under
- * the terms of the GNU General Public License version 2 as published by the
- * Free Software Foundation. This program is dual-licensed; you may select
- * either version 2 of the GNU General Public License ("GPL") or BSD license
- * ("BSD").
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
  */
 
-#ifndef ZSTD_H
-#define ZSTD_H
+#ifndef ZSTD_H_235446
+#define ZSTD_H_235446
 
 /* ======   Dependency   ======*/
+#include <linux/limits.h>   /* INT_MAX */
 #include <linux/types.h>   /* size_t */
 
 
-/*-*****************************************************************************
- * Introduction
- *
- * zstd, short for Zstandard, is a fast lossless compression algorithm,
- * targeting real-time compression scenarios at zlib-level and better
- * compression ratios. The zstd compression library provides in-memory
- * compression and decompression functions. The library supports compression
- * levels from 1 up to ZSTD_maxCLevel() which is 22. Levels >= 20, labeled
- * ultra, should be used with caution, as they require more memory.
- * Compression can be done in:
- *  - a single step, reusing a context (described as Explicit memory management)
- *  - unbounded multiple steps (described as Streaming compression)
- * The compression ratio achievable on small data can be highly improved using
- * compression with a dictionary in:
- *  - a single step (described as Simple dictionary API)
- *  - a single step, reusing a dictionary (described as Fast dictionary API)
- ******************************************************************************/
+/* =====   ZSTDLIB_API : control library symbols visibility   ===== */
+#define ZSTDLIB_VISIBILITY 
+#define ZSTDLIB_API ZSTDLIB_VISIBILITY
+
+
+/* *****************************************************************************
+  Introduction
+
+  zstd, short for Zstandard, is a fast lossless compression algorithm, targeting
+  real-time compression scenarios at zlib-level and better compression ratios.
+  The zstd compression library provides in-memory compression and decompression
+  functions.
+
+  The library supports regular compression levels from 1 up to ZSTD_maxCLevel(),
+  which is currently 22. Levels >= 20, labeled `--ultra`, should be used with
+  caution, as they require more memory. The library also offers negative
+  compression levels, which extend the range of speed vs. ratio preferences.
+  The lower the level, the faster the speed (at the cost of compression).
+
+  Compression can be done in:
+    - a single step (described as Simple API)
+    - a single step, reusing a context (described as Explicit context)
+    - unbounded multiple steps (described as Streaming compression)
+
+  The compression ratio achievable on small data can be highly improved using
+  a dictionary. Dictionary compression can be performed in:
+    - a single step (described as Simple dictionary API)
+    - a single step, reusing a dictionary (described as Bulk-processing
+      dictionary API)
+
+  Advanced experimental functions can be accessed using
+  `#define ZSTD_STATIC_LINKING_ONLY` before including zstd.h.
+
+  Advanced experimental APIs should never be used with a dynamically-linked
+  library. They are not "stable"; their definitions or signatures may change in
+  the future. Only static linking is allowed.
+*******************************************************************************/
+
+/*------   Version   ------*/
+#define ZSTD_VERSION_MAJOR    1
+#define ZSTD_VERSION_MINOR    4
+#define ZSTD_VERSION_RELEASE  10
+#define ZSTD_VERSION_NUMBER  (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE)
+
+/*! ZSTD_versionNumber() :
+ *  Return runtime library version, the value is (MAJOR*100*100 + MINOR*100 + RELEASE). */
+ZSTDLIB_API unsigned ZSTD_versionNumber(void);
+
+#define ZSTD_LIB_VERSION ZSTD_VERSION_MAJOR.ZSTD_VERSION_MINOR.ZSTD_VERSION_RELEASE
+#define ZSTD_QUOTE(str) #str
+#define ZSTD_EXPAND_AND_QUOTE(str) ZSTD_QUOTE(str)
+#define ZSTD_VERSION_STRING ZSTD_EXPAND_AND_QUOTE(ZSTD_LIB_VERSION)
+
+/*! ZSTD_versionString() :
+ *  Return runtime library version, like "1.4.5". Requires v1.3.0+. */
+ZSTDLIB_API const char* ZSTD_versionString(void);
+
+/* *************************************
+ *  Default constant
+ ***************************************/
+#ifndef ZSTD_CLEVEL_DEFAULT
+#  define ZSTD_CLEVEL_DEFAULT 3
+#endif
+
+/* *************************************
+ *  Constants
+ ***************************************/
+
+/* All magic numbers are supposed read/written to/from files/memory using little-endian convention */
+#define ZSTD_MAGICNUMBER            0xFD2FB528    /* valid since v0.8.0 */
+#define ZSTD_MAGIC_DICTIONARY       0xEC30A437    /* valid since v0.7.0 */
+#define ZSTD_MAGIC_SKIPPABLE_START  0x184D2A50    /* all 16 values, from 0x184D2A50 to 0x184D2A5F, signal the beginning of a skippable frame */
+#define ZSTD_MAGIC_SKIPPABLE_MASK   0xFFFFFFF0
+
+#define ZSTD_BLOCKSIZELOG_MAX  17
+#define ZSTD_BLOCKSIZE_MAX     (1<<ZSTD_BLOCKSIZELOG_MAX)
+
+
+
+/* *************************************
+*  Simple API
+***************************************/
+/*! ZSTD_compress() :
+ *  Compresses `src` content as a single zstd compressed frame into already allocated `dst`.
+ *  Hint : compression runs faster if `dstCapacity` >=  `ZSTD_compressBound(srcSize)`.
+ *  @return : compressed size written into `dst` (<= `dstCapacity),
+ *            or an error code if it fails (which can be tested using ZSTD_isError()). */
+ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity,
+                            const void* src, size_t srcSize,
+                                  int compressionLevel);
+
+/*! ZSTD_decompress() :
+ *  `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames.
+ *  `dstCapacity` is an upper bound of originalSize to regenerate.
+ *  If user cannot imply a maximum upper bound, it's better to use streaming mode to decompress data.
+ *  @return : the number of bytes decompressed into `dst` (<= `dstCapacity`),
+ *            or an errorCode if it fails (which can be tested using ZSTD_isError()). */
+ZSTDLIB_API size_t ZSTD_decompress( void* dst, size_t dstCapacity,
+                              const void* src, size_t compressedSize);
+
+/*! ZSTD_getFrameContentSize() : requires v1.3.0+
+ *  `src` should point to the start of a ZSTD encoded frame.
+ *  `srcSize` must be at least as large as the frame header.
+ *            hint : any size >= `ZSTD_frameHeaderSize_max` is large enough.
+ *  @return : - decompressed size of `src` frame content, if known
+ *            - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined
+ *            - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small)
+ *   note 1 : a 0 return value means the frame is valid but "empty".
+ *   note 2 : decompressed size is an optional field, it may not be present, typically in streaming mode.
+ *            When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size.
+ *            In which case, it's necessary to use streaming mode to decompress data.
+ *            Optionally, application can rely on some implicit limit,
+ *            as ZSTD_decompress() only needs an upper bound of decompressed size.
+ *            (For example, data could be necessarily cut into blocks <= 16 KB).
+ *   note 3 : decompressed size is always present when compression is completed using single-pass functions,
+ *            such as ZSTD_compress(), ZSTD_compressCCtx() ZSTD_compress_usingDict() or ZSTD_compress_usingCDict().
+ *   note 4 : decompressed size can be very large (64-bits value),
+ *            potentially larger than what local system can handle as a single memory segment.
+ *            In which case, it's necessary to use streaming mode to decompress data.
+ *   note 5 : If source is untrusted, decompressed size could be wrong or intentionally modified.
+ *            Always ensure return value fits within application's authorized limits.
+ *            Each application can set its own limits.
+ *   note 6 : This function replaces ZSTD_getDecompressedSize() */
+#define ZSTD_CONTENTSIZE_UNKNOWN (0ULL - 1)
+#define ZSTD_CONTENTSIZE_ERROR   (0ULL - 2)
+ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize);
+
+/*! ZSTD_getDecompressedSize() :
+ *  NOTE: This function is now obsolete, in favor of ZSTD_getFrameContentSize().
+ *  Both functions work the same way, but ZSTD_getDecompressedSize() blends
+ *  "empty", "unknown" and "error" results to the same return value (0),
+ *  while ZSTD_getFrameContentSize() gives them separate return values.
+ * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */
+ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize);
+
+/*! ZSTD_findFrameCompressedSize() :
+ * `src` should point to the start of a ZSTD frame or skippable frame.
+ * `srcSize` must be >= first frame size
+ * @return : the compressed size of the first frame starting at `src`,
+ *           suitable to pass as `srcSize` to `ZSTD_decompress` or similar,
+ *        or an error code if input is invalid */
+ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize);
+
 
 /*======  Helper functions  ======*/
+#define ZSTD_COMPRESSBOUND(srcSize)   ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0))  /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */
+ZSTDLIB_API size_t      ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */
+ZSTDLIB_API unsigned    ZSTD_isError(size_t code);          /*!< tells if a `size_t` function result is an error code */
+ZSTDLIB_API const char* ZSTD_getErrorName(size_t code);     /*!< provides readable string from an error code */
+ZSTDLIB_API int         ZSTD_minCLevel(void);               /*!< minimum negative compression level allowed */
+ZSTDLIB_API int         ZSTD_maxCLevel(void);               /*!< maximum compression level available */
+
+
+/* *************************************
+*  Explicit context
+***************************************/
+/*= Compression context
+ *  When compressing many times,
+ *  it is recommended to allocate a context just once,
+ *  and re-use it for each successive compression operation.
+ *  This will make workload friendlier for system's memory.
+ *  Note : re-using context is just a speed / resource optimization.
+ *         It doesn't change the compression ratio, which remains identical.
+ *  Note 2 : In multi-threaded environments,
+ *         use one different context per thread for parallel execution.
+ */
+typedef struct ZSTD_CCtx_s ZSTD_CCtx;
+ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx(void);
+ZSTDLIB_API size_t     ZSTD_freeCCtx(ZSTD_CCtx* cctx);  /* accept NULL pointer */
+
+/*! ZSTD_compressCCtx() :
+ *  Same as ZSTD_compress(), using an explicit ZSTD_CCtx.
+ *  Important : in order to behave similarly to `ZSTD_compress()`,
+ *  this function compresses at requested compression level,
+ *  __ignoring any other parameter__ .
+ *  If any advanced parameter was set using the advanced API,
+ *  they will all be reset. Only `compressionLevel` remains.
+ */
+ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx,
+                                     void* dst, size_t dstCapacity,
+                               const void* src, size_t srcSize,
+                                     int compressionLevel);
+
+/*= Decompression context
+ *  When decompressing many times,
+ *  it is recommended to allocate a context only once,
+ *  and re-use it for each successive compression operation.
+ *  This will make workload friendlier for system's memory.
+ *  Use one context per thread for parallel execution. */
+typedef struct ZSTD_DCtx_s ZSTD_DCtx;
+ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx(void);
+ZSTDLIB_API size_t     ZSTD_freeDCtx(ZSTD_DCtx* dctx);  /* accept NULL pointer */
+
+/*! ZSTD_decompressDCtx() :
+ *  Same as ZSTD_decompress(),
+ *  requires an allocated ZSTD_DCtx.
+ *  Compatible with sticky parameters.
+ */
+ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx,
+                                       void* dst, size_t dstCapacity,
+                                 const void* src, size_t srcSize);
 
-/**
- * enum ZSTD_ErrorCode - zstd error codes
+
+/* *************************************
+*  Advanced compression API
+***************************************/
+
+/* API design :
+ *   Parameters are pushed one by one into an existing context,
+ *   using ZSTD_CCtx_set*() functions.
+ *   Pushed parameters are sticky : they are valid for next compressed frame, and any subsequent frame.
+ *   "sticky" parameters are applicable to `ZSTD_compress2()` and `ZSTD_compressStream*()` !
+ *   __They do not apply to "simple" one-shot variants such as ZSTD_compressCCtx()__ .
+ *
+ *   It's possible to reset all parameters to "default" using ZSTD_CCtx_reset().
  *
- * Functions that return size_t can be checked for errors using ZSTD_isError()
- * and the ZSTD_ErrorCode can be extracted using ZSTD_getErrorCode().
+ *   This API supercedes all other "advanced" API entry points in the experimental section.
+ *   In the future, we expect to remove from experimental API entry points which are redundant with this API.
+ */
+
+
+/* Compression strategies, listed from fastest to strongest */
+typedef enum { ZSTD_fast=1,
+               ZSTD_dfast=2,
+               ZSTD_greedy=3,
+               ZSTD_lazy=4,
+               ZSTD_lazy2=5,
+               ZSTD_btlazy2=6,
+               ZSTD_btopt=7,
+               ZSTD_btultra=8,
+               ZSTD_btultra2=9
+               /* note : new strategies _might_ be added in the future.
+                         Only the order (from fast to strong) is guaranteed */
+} ZSTD_strategy;
+
+
+typedef enum {
+
+    /* compression parameters
+     * Note: When compressing with a ZSTD_CDict these parameters are superseded
+     * by the parameters used to construct the ZSTD_CDict.
+     * See ZSTD_CCtx_refCDict() for more info (superseded-by-cdict). */
+    ZSTD_c_compressionLevel=100, /* Set compression parameters according to pre-defined cLevel table.
+                              * Note that exact compression parameters are dynamically determined,
+                              * depending on both compression level and srcSize (when known).
+                              * Default level is ZSTD_CLEVEL_DEFAULT==3.
+                              * Special: value 0 means default, which is controlled by ZSTD_CLEVEL_DEFAULT.
+                              * Note 1 : it's possible to pass a negative compression level.
+                              * Note 2 : setting a level does not automatically set all other compression parameters
+                              *   to default. Setting this will however eventually dynamically impact the compression
+                              *   parameters which have not been manually set. The manually set
+                              *   ones will 'stick'. */
+    /* Advanced compression parameters :
+     * It's possible to pin down compression parameters to some specific values.
+     * In which case, these values are no longer dynamically selected by the compressor */
+    ZSTD_c_windowLog=101,    /* Maximum allowed back-reference distance, expressed as power of 2.
+                              * This will set a memory budget for streaming decompression,
+                              * with larger values requiring more memory
+                              * and typically compressing more.
+                              * Must be clamped between ZSTD_WINDOWLOG_MIN and ZSTD_WINDOWLOG_MAX.
+                              * Special: value 0 means "use default windowLog".
+                              * Note: Using a windowLog greater than ZSTD_WINDOWLOG_LIMIT_DEFAULT
+                              *       requires explicitly allowing such size at streaming decompression stage. */
+    ZSTD_c_hashLog=102,      /* Size of the initial probe table, as a power of 2.
+                              * Resulting memory usage is (1 << (hashLog+2)).
+                              * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX.
+                              * Larger tables improve compression ratio of strategies <= dFast,
+                              * and improve speed of strategies > dFast.
+                              * Special: value 0 means "use default hashLog". */
+    ZSTD_c_chainLog=103,     /* Size of the multi-probe search table, as a power of 2.
+                              * Resulting memory usage is (1 << (chainLog+2)).
+                              * Must be clamped between ZSTD_CHAINLOG_MIN and ZSTD_CHAINLOG_MAX.
+                              * Larger tables result in better and slower compression.
+                              * This parameter is useless for "fast" strategy.
+                              * It's still useful when using "dfast" strategy,
+                              * in which case it defines a secondary probe table.
+                              * Special: value 0 means "use default chainLog". */
+    ZSTD_c_searchLog=104,    /* Number of search attempts, as a power of 2.
+                              * More attempts result in better and slower compression.
+                              * This parameter is useless for "fast" and "dFast" strategies.
+                              * Special: value 0 means "use default searchLog". */
+    ZSTD_c_minMatch=105,     /* Minimum size of searched matches.
+                              * Note that Zstandard can still find matches of smaller size,
+                              * it just tweaks its search algorithm to look for this size and larger.
+                              * Larger values increase compression and decompression speed, but decrease ratio.
+                              * Must be clamped between ZSTD_MINMATCH_MIN and ZSTD_MINMATCH_MAX.
+                              * Note that currently, for all strategies < btopt, effective minimum is 4.
+                              *                    , for all strategies > fast, effective maximum is 6.
+                              * Special: value 0 means "use default minMatchLength". */
+    ZSTD_c_targetLength=106, /* Impact of this field depends on strategy.
+                              * For strategies btopt, btultra & btultra2:
+                              *     Length of Match considered "good enough" to stop search.
+                              *     Larger values make compression stronger, and slower.
+                              * For strategy fast:
+                              *     Distance between match sampling.
+                              *     Larger values make compression faster, and weaker.
+                              * Special: value 0 means "use default targetLength". */
+    ZSTD_c_strategy=107,     /* See ZSTD_strategy enum definition.
+                              * The higher the value of selected strategy, the more complex it is,
+                              * resulting in stronger and slower compression.
+                              * Special: value 0 means "use default strategy". */
+
+    /* LDM mode parameters */
+    ZSTD_c_enableLongDistanceMatching=160, /* Enable long distance matching.
+                                     * This parameter is designed to improve compression ratio
+                                     * for large inputs, by finding large matches at long distance.
+                                     * It increases memory usage and window size.
+                                     * Note: enabling this parameter increases default ZSTD_c_windowLog to 128 MB
+                                     * except when expressly set to a different value.
+                                     * Note: will be enabled by default if ZSTD_c_windowLog >= 128 MB and
+                                     * compression strategy >= ZSTD_btopt (== compression level 16+) */
+    ZSTD_c_ldmHashLog=161,   /* Size of the table for long distance matching, as a power of 2.
+                              * Larger values increase memory usage and compression ratio,
+                              * but decrease compression speed.
+                              * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX
+                              * default: windowlog - 7.
+                              * Special: value 0 means "automatically determine hashlog". */
+    ZSTD_c_ldmMinMatch=162,  /* Minimum match size for long distance matcher.
+                              * Larger/too small values usually decrease compression ratio.
+                              * Must be clamped between ZSTD_LDM_MINMATCH_MIN and ZSTD_LDM_MINMATCH_MAX.
+                              * Special: value 0 means "use default value" (default: 64). */
+    ZSTD_c_ldmBucketSizeLog=163, /* Log size of each bucket in the LDM hash table for collision resolution.
+                              * Larger values improve collision resolution but decrease compression speed.
+                              * The maximum value is ZSTD_LDM_BUCKETSIZELOG_MAX.
+                              * Special: value 0 means "use default value" (default: 3). */
+    ZSTD_c_ldmHashRateLog=164, /* Frequency of inserting/looking up entries into the LDM hash table.
+                              * Must be clamped between 0 and (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN).
+                              * Default is MAX(0, (windowLog - ldmHashLog)), optimizing hash table usage.
+                              * Larger values improve compression speed.
+                              * Deviating far from default value will likely result in a compression ratio decrease.
+                              * Special: value 0 means "automatically determine hashRateLog". */
+
+    /* frame parameters */
+    ZSTD_c_contentSizeFlag=200, /* Content size will be written into frame header _whenever known_ (default:1)
+                              * Content size must be known at the beginning of compression.
+                              * This is automatically the case when using ZSTD_compress2(),
+                              * For streaming scenarios, content size must be provided with ZSTD_CCtx_setPledgedSrcSize() */
+    ZSTD_c_checksumFlag=201, /* A 32-bits checksum of content is written at end of frame (default:0) */
+    ZSTD_c_dictIDFlag=202,   /* When applicable, dictionary's ID is written into frame header (default:1) */
+
+    /* multi-threading parameters */
+    /* These parameters are only active if multi-threading is enabled (compiled with build macro ZSTD_MULTITHREAD).
+     * Otherwise, trying to set any other value than default (0) will be a no-op and return an error.
+     * In a situation where it's unknown if the linked library supports multi-threading or not,
+     * setting ZSTD_c_nbWorkers to any value >= 1 and consulting the return value provides a quick way to check this property.
+     */
+    ZSTD_c_nbWorkers=400,    /* Select how many threads will be spawned to compress in parallel.
+                              * When nbWorkers >= 1, triggers asynchronous mode when invoking ZSTD_compressStream*() :
+                              * ZSTD_compressStream*() consumes input and flush output if possible, but immediately gives back control to caller,
+                              * while compression is performed in parallel, within worker thread(s).
+                              * (note : a strong exception to this rule is when first invocation of ZSTD_compressStream2() sets ZSTD_e_end :
+                              *  in which case, ZSTD_compressStream2() delegates to ZSTD_compress2(), which is always a blocking call).
+                              * More workers improve speed, but also increase memory usage.
+                              * Default value is `0`, aka "single-threaded mode" : no worker is spawned,
+                              * compression is performed inside Caller's thread, and all invocations are blocking */
+    ZSTD_c_jobSize=401,      /* Size of a compression job. This value is enforced only when nbWorkers >= 1.
+                              * Each compression job is completed in parallel, so this value can indirectly impact the nb of active threads.
+                              * 0 means default, which is dynamically determined based on compression parameters.
+                              * Job size must be a minimum of overlap size, or 1 MB, whichever is largest.
+                              * The minimum size is automatically and transparently enforced. */
+    ZSTD_c_overlapLog=402,   /* Control the overlap size, as a fraction of window size.
+                              * The overlap size is an amount of data reloaded from previous job at the beginning of a new job.
+                              * It helps preserve compression ratio, while each job is compressed in parallel.
+                              * This value is enforced only when nbWorkers >= 1.
+                              * Larger values increase compression ratio, but decrease speed.
+                              * Possible values range from 0 to 9 :
+                              * - 0 means "default" : value will be determined by the library, depending on strategy
+                              * - 1 means "no overlap"
+                              * - 9 means "full overlap", using a full window size.
+                              * Each intermediate rank increases/decreases load size by a factor 2 :
+                              * 9: full window;  8: w/2;  7: w/4;  6: w/8;  5:w/16;  4: w/32;  3:w/64;  2:w/128;  1:no overlap;  0:default
+                              * default value varies between 6 and 9, depending on strategy */
+
+    /* note : additional experimental parameters are also available
+     * within the experimental section of the API.
+     * At the time of this writing, they include :
+     * ZSTD_c_rsyncable
+     * ZSTD_c_format
+     * ZSTD_c_forceMaxWindow
+     * ZSTD_c_forceAttachDict
+     * ZSTD_c_literalCompressionMode
+     * ZSTD_c_targetCBlockSize
+     * ZSTD_c_srcSizeHint
+     * ZSTD_c_enableDedicatedDictSearch
+     * ZSTD_c_stableInBuffer
+     * ZSTD_c_stableOutBuffer
+     * ZSTD_c_blockDelimiters
+     * ZSTD_c_validateSequences
+     * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
+     * note : never ever use experimentalParam? names directly;
+     *        also, the enums values themselves are unstable and can still change.
+     */
+     ZSTD_c_experimentalParam1=500,
+     ZSTD_c_experimentalParam2=10,
+     ZSTD_c_experimentalParam3=1000,
+     ZSTD_c_experimentalParam4=1001,
+     ZSTD_c_experimentalParam5=1002,
+     ZSTD_c_experimentalParam6=1003,
+     ZSTD_c_experimentalParam7=1004,
+     ZSTD_c_experimentalParam8=1005,
+     ZSTD_c_experimentalParam9=1006,
+     ZSTD_c_experimentalParam10=1007,
+     ZSTD_c_experimentalParam11=1008,
+     ZSTD_c_experimentalParam12=1009
+} ZSTD_cParameter;
+
+typedef struct {
+    size_t error;
+    int lowerBound;
+    int upperBound;
+} ZSTD_bounds;
+
+/*! ZSTD_cParam_getBounds() :
+ *  All parameters must belong to an interval with lower and upper bounds,
+ *  otherwise they will either trigger an error or be automatically clamped.
+ * @return : a structure, ZSTD_bounds, which contains
+ *         - an error status field, which must be tested using ZSTD_isError()
+ *         - lower and upper bounds, both inclusive
+ */
+ZSTDLIB_API ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter cParam);
+
+/*! ZSTD_CCtx_setParameter() :
+ *  Set one compression parameter, selected by enum ZSTD_cParameter.
+ *  All parameters have valid bounds. Bounds can be queried using ZSTD_cParam_getBounds().
+ *  Providing a value beyond bound will either clamp it, or trigger an error (depending on parameter).
+ *  Setting a parameter is generally only possible during frame initialization (before starting compression).
+ *  Exception : when using multi-threading mode (nbWorkers >= 1),
+ *              the following parameters can be updated _during_ compression (within same frame):
+ *              => compressionLevel, hashLog, chainLog, searchLog, minMatch, targetLength and strategy.
+ *              new parameters will be active for next job only (after a flush()).
+ * @return : an error code (which can be tested using ZSTD_isError()).
  */
+ZSTDLIB_API size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value);
+
+/*! ZSTD_CCtx_setPledgedSrcSize() :
+ *  Total input data size to be compressed as a single frame.
+ *  Value will be written in frame header, unless if explicitly forbidden using ZSTD_c_contentSizeFlag.
+ *  This value will also be controlled at end of frame, and trigger an error if not respected.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Note 1 : pledgedSrcSize==0 actually means zero, aka an empty frame.
+ *           In order to mean "unknown content size", pass constant ZSTD_CONTENTSIZE_UNKNOWN.
+ *           ZSTD_CONTENTSIZE_UNKNOWN is default value for any new frame.
+ *  Note 2 : pledgedSrcSize is only valid once, for the next frame.
+ *           It's discarded at the end of the frame, and replaced by ZSTD_CONTENTSIZE_UNKNOWN.
+ *  Note 3 : Whenever all input data is provided and consumed in a single round,
+ *           for example with ZSTD_compress2(),
+ *           or invoking immediately ZSTD_compressStream2(,,,ZSTD_e_end),
+ *           this value is automatically overridden by srcSize instead.
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize);
+
 typedef enum {
-	ZSTD_error_no_error,
-	ZSTD_error_GENERIC,
-	ZSTD_error_prefix_unknown,
-	ZSTD_error_version_unsupported,
-	ZSTD_error_parameter_unknown,
-	ZSTD_error_frameParameter_unsupported,
-	ZSTD_error_frameParameter_unsupportedBy32bits,
-	ZSTD_error_frameParameter_windowTooLarge,
-	ZSTD_error_compressionParameter_unsupported,
-	ZSTD_error_init_missing,
-	ZSTD_error_memory_allocation,
-	ZSTD_error_stage_wrong,
-	ZSTD_error_dstSize_tooSmall,
-	ZSTD_error_srcSize_wrong,
-	ZSTD_error_corruption_detected,
-	ZSTD_error_checksum_wrong,
-	ZSTD_error_tableLog_tooLarge,
-	ZSTD_error_maxSymbolValue_tooLarge,
-	ZSTD_error_maxSymbolValue_tooSmall,
-	ZSTD_error_dictionary_corrupted,
-	ZSTD_error_dictionary_wrong,
-	ZSTD_error_dictionaryCreation_failed,
-	ZSTD_error_maxCode
-} ZSTD_ErrorCode;
-
-/**
- * ZSTD_maxCLevel() - maximum compression level available
- *
- * Return: Maximum compression level available.
- */
-int ZSTD_maxCLevel(void);
-/**
- * ZSTD_compressBound() - maximum compressed size in worst case scenario
- * @srcSize: The size of the data to compress.
- *
- * Return:   The maximum compressed size in the worst case scenario.
- */
-size_t ZSTD_compressBound(size_t srcSize);
-/**
- * ZSTD_isError() - tells if a size_t function result is an error code
- * @code:  The function result to check for error.
- *
- * Return: Non-zero iff the code is an error.
- */
-static __attribute__((unused)) unsigned int ZSTD_isError(size_t code)
-{
-	return code > (size_t)-ZSTD_error_maxCode;
-}
-/**
- * ZSTD_getErrorCode() - translates an error function result to a ZSTD_ErrorCode
- * @functionResult: The result of a function for which ZSTD_isError() is true.
- *
- * Return:          The ZSTD_ErrorCode corresponding to the functionResult or 0
- *                  if the functionResult isn't an error.
- */
-static __attribute__((unused)) ZSTD_ErrorCode ZSTD_getErrorCode(
-	size_t functionResult)
-{
-	if (!ZSTD_isError(functionResult))
-		return (ZSTD_ErrorCode)0;
-	return (ZSTD_ErrorCode)(0 - functionResult);
-}
-
-/**
- * enum ZSTD_strategy - zstd compression search strategy
- *
- * From faster to stronger.
+    ZSTD_reset_session_only = 1,
+    ZSTD_reset_parameters = 2,
+    ZSTD_reset_session_and_parameters = 3
+} ZSTD_ResetDirective;
+
+/*! ZSTD_CCtx_reset() :
+ *  There are 2 different things that can be reset, independently or jointly :
+ *  - The session : will stop compressing current frame, and make CCtx ready to start a new one.
+ *                  Useful after an error, or to interrupt any ongoing compression.
+ *                  Any internal data not yet flushed is cancelled.
+ *                  Compression parameters and dictionary remain unchanged.
+ *                  They will be used to compress next frame.
+ *                  Resetting session never fails.
+ *  - The parameters : changes all parameters back to "default".
+ *                  This removes any reference to any dictionary too.
+ *                  Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing)
+ *                  otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError())
+ *  - Both : similar to resetting the session, followed by resetting parameters.
  */
+ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset);
+
+/*! ZSTD_compress2() :
+ *  Behave the same as ZSTD_compressCCtx(), but compression parameters are set using the advanced API.
+ *  ZSTD_compress2() always starts a new frame.
+ *  Should cctx hold data from a previously unfinished frame, everything about it is forgotten.
+ *  - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*()
+ *  - The function is always blocking, returns when compression is completed.
+ *  Hint : compression runs faster if `dstCapacity` >=  `ZSTD_compressBound(srcSize)`.
+ * @return : compressed size written into `dst` (<= `dstCapacity),
+ *           or an error code if it fails (which can be tested using ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_compress2( ZSTD_CCtx* cctx,
+                                   void* dst, size_t dstCapacity,
+                             const void* src, size_t srcSize);
+
+
+/* *************************************
+*  Advanced decompression API
+***************************************/
+
+/* The advanced API pushes parameters one by one into an existing DCtx context.
+ * Parameters are sticky, and remain valid for all following frames
+ * using the same DCtx context.
+ * It's possible to reset parameters to default values using ZSTD_DCtx_reset().
+ * Note : This API is compatible with existing ZSTD_decompressDCtx() and ZSTD_decompressStream().
+ *        Therefore, no new decompression function is necessary.
+ */
+
 typedef enum {
-	ZSTD_fast,
-	ZSTD_dfast,
-	ZSTD_greedy,
-	ZSTD_lazy,
-	ZSTD_lazy2,
-	ZSTD_btlazy2,
-	ZSTD_btopt,
-	ZSTD_btopt2
-} ZSTD_strategy;
 
-/**
- * struct ZSTD_compressionParameters - zstd compression parameters
- * @windowLog:    Log of the largest match distance. Larger means more
- *                compression, and more memory needed during decompression.
- * @chainLog:     Fully searched segment. Larger means more compression, slower,
- *                and more memory (useless for fast).
- * @hashLog:      Dispatch table. Larger means more compression,
- *                slower, and more memory.
- * @searchLog:    Number of searches. Larger means more compression and slower.
- * @searchLength: Match length searched. Larger means faster decompression,
- *                sometimes less compression.
- * @targetLength: Acceptable match size for optimal parser (only). Larger means
- *                more compression, and slower.
- * @strategy:     The zstd compression strategy.
+    ZSTD_d_windowLogMax=100, /* Select a size limit (in power of 2) beyond which
+                              * the streaming API will refuse to allocate memory buffer
+                              * in order to protect the host from unreasonable memory requirements.
+                              * This parameter is only useful in streaming mode, since no internal buffer is allocated in single-pass mode.
+                              * By default, a decompression context accepts window sizes <= (1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT).
+                              * Special: value 0 means "use default maximum windowLog". */
+
+    /* note : additional experimental parameters are also available
+     * within the experimental section of the API.
+     * At the time of this writing, they include :
+     * ZSTD_d_format
+     * ZSTD_d_stableOutBuffer
+     * ZSTD_d_forceIgnoreChecksum
+     * ZSTD_d_refMultipleDDicts
+     * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
+     * note : never ever use experimentalParam? names directly
+     */
+     ZSTD_d_experimentalParam1=1000,
+     ZSTD_d_experimentalParam2=1001,
+     ZSTD_d_experimentalParam3=1002,
+     ZSTD_d_experimentalParam4=1003
+
+} ZSTD_dParameter;
+
+/*! ZSTD_dParam_getBounds() :
+ *  All parameters must belong to an interval with lower and upper bounds,
+ *  otherwise they will either trigger an error or be automatically clamped.
+ * @return : a structure, ZSTD_bounds, which contains
+ *         - an error status field, which must be tested using ZSTD_isError()
+ *         - both lower and upper bounds, inclusive
  */
-typedef struct {
-	unsigned int windowLog;
-	unsigned int chainLog;
-	unsigned int hashLog;
-	unsigned int searchLog;
-	unsigned int searchLength;
-	unsigned int targetLength;
-	ZSTD_strategy strategy;
-} ZSTD_compressionParameters;
+ZSTDLIB_API ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam);
+
+/*! ZSTD_DCtx_setParameter() :
+ *  Set one compression parameter, selected by enum ZSTD_dParameter.
+ *  All parameters have valid bounds. Bounds can be queried using ZSTD_dParam_getBounds().
+ *  Providing a value beyond bound will either clamp it, or trigger an error (depending on parameter).
+ *  Setting a parameter is only possible during frame initialization (before starting decompression).
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int value);
+
+/*! ZSTD_DCtx_reset() :
+ *  Return a DCtx to clean state.
+ *  Session and parameters can be reset jointly or separately.
+ *  Parameters can only be reset when no active frame is being decompressed.
+ * @return : 0, or an error code, which can be tested with ZSTD_isError()
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_reset(ZSTD_DCtx* dctx, ZSTD_ResetDirective reset);
+
+
+/* **************************
+*  Streaming
+****************************/
+
+typedef struct ZSTD_inBuffer_s {
+  const void* src;    /*< start of input buffer */
+  size_t size;        /*< size of input buffer */
+  size_t pos;         /*< position where reading stopped. Will be updated. Necessarily 0 <= pos <= size */
+} ZSTD_inBuffer;
+
+typedef struct ZSTD_outBuffer_s {
+  void*  dst;         /*< start of output buffer */
+  size_t size;        /*< size of output buffer */
+  size_t pos;         /*< position where writing stopped. Will be updated. Necessarily 0 <= pos <= size */
+} ZSTD_outBuffer;
+
 
-/**
- * struct ZSTD_frameParameters - zstd frame parameters
- * @contentSizeFlag: Controls whether content size will be present in the frame
- *                   header (when known).
- * @checksumFlag:    Controls whether a 32-bit checksum is generated at the end
- *                   of the frame for error detection.
- * @noDictIDFlag:    Controls whether dictID will be saved into the frame header
- *                   when using dictionary compression.
+
+/*-***********************************************************************
+*  Streaming compression - HowTo
+*
+*  A ZSTD_CStream object is required to track streaming operation.
+*  Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources.
+*  ZSTD_CStream objects can be reused multiple times on consecutive compression operations.
+*  It is recommended to re-use ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory.
+*
+*  For parallel execution, use one separate ZSTD_CStream per thread.
+*
+*  note : since v1.3.0, ZSTD_CStream and ZSTD_CCtx are the same thing.
+*
+*  Parameters are sticky : when starting a new compression on the same context,
+*  it will re-use the same sticky parameters as previous compression session.
+*  When in doubt, it's recommended to fully initialize the context before usage.
+*  Use ZSTD_CCtx_reset() to reset the context and ZSTD_CCtx_setParameter(),
+*  ZSTD_CCtx_setPledgedSrcSize(), or ZSTD_CCtx_loadDictionary() and friends to
+*  set more specific parameters, the pledged source size, or load a dictionary.
+*
+*  Use ZSTD_compressStream2() with ZSTD_e_continue as many times as necessary to
+*  consume input stream. The function will automatically update both `pos`
+*  fields within `input` and `output`.
+*  Note that the function may not consume the entire input, for example, because
+*  the output buffer is already full, in which case `input.pos < input.size`.
+*  The caller must check if input has been entirely consumed.
+*  If not, the caller must make some room to receive more compressed data,
+*  and then present again remaining input data.
+*  note: ZSTD_e_continue is guaranteed to make some forward progress when called,
+*        but doesn't guarantee maximal forward progress. This is especially relevant
+*        when compressing with multiple threads. The call won't block if it can
+*        consume some input, but if it can't it will wait for some, but not all,
+*        output to be flushed.
+* @return : provides a minimum amount of data remaining to be flushed from internal buffers
+*           or an error code, which can be tested using ZSTD_isError().
+*
+*  At any moment, it's possible to flush whatever data might remain stuck within internal buffer,
+*  using ZSTD_compressStream2() with ZSTD_e_flush. `output->pos` will be updated.
+*  Note that, if `output->size` is too small, a single invocation with ZSTD_e_flush might not be enough (return code > 0).
+*  In which case, make some room to receive more compressed data, and call again ZSTD_compressStream2() with ZSTD_e_flush.
+*  You must continue calling ZSTD_compressStream2() with ZSTD_e_flush until it returns 0, at which point you can change the
+*  operation.
+*  note: ZSTD_e_flush will flush as much output as possible, meaning when compressing with multiple threads, it will
+*        block until the flush is complete or the output buffer is full.
+*  @return : 0 if internal buffers are entirely flushed,
+*            >0 if some data still present within internal buffer (the value is minimal estimation of remaining size),
+*            or an error code, which can be tested using ZSTD_isError().
+*
+*  Calling ZSTD_compressStream2() with ZSTD_e_end instructs to finish a frame.
+*  It will perform a flush and write frame epilogue.
+*  The epilogue is required for decoders to consider a frame completed.
+*  flush operation is the same, and follows same rules as calling ZSTD_compressStream2() with ZSTD_e_flush.
+*  You must continue calling ZSTD_compressStream2() with ZSTD_e_end until it returns 0, at which point you are free to
+*  start a new frame.
+*  note: ZSTD_e_end will flush as much output as possible, meaning when compressing with multiple threads, it will
+*        block until the flush is complete or the output buffer is full.
+*  @return : 0 if frame fully completed and fully flushed,
+*            >0 if some data still present within internal buffer (the value is minimal estimation of remaining size),
+*            or an error code, which can be tested using ZSTD_isError().
+*
+* *******************************************************************/
+
+typedef ZSTD_CCtx ZSTD_CStream;  /*< CCtx and CStream are now effectively same object (>= v1.3.0) */
+                                 /* Continue to distinguish them for compatibility with older versions <= v1.2.0 */
+/*===== ZSTD_CStream management functions =====*/
+ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream(void);
+ZSTDLIB_API size_t ZSTD_freeCStream(ZSTD_CStream* zcs);  /* accept NULL pointer */
+
+/*===== Streaming compression functions =====*/
+typedef enum {
+    ZSTD_e_continue=0, /* collect more data, encoder decides when to output compressed result, for optimal compression ratio */
+    ZSTD_e_flush=1,    /* flush any data provided so far,
+                        * it creates (at least) one new block, that can be decoded immediately on reception;
+                        * frame will continue: any future data can still reference previously compressed data, improving compression.
+                        * note : multithreaded compression will block to flush as much output as possible. */
+    ZSTD_e_end=2       /* flush any remaining data _and_ close current frame.
+                        * note that frame is only closed after compressed data is fully flushed (return value == 0).
+                        * After that point, any additional data starts a new frame.
+                        * note : each frame is independent (does not reference any content from previous frame).
+                        : note : multithreaded compression will block to flush as much output as possible. */
+} ZSTD_EndDirective;
+
+/*! ZSTD_compressStream2() :
+ *  Behaves about the same as ZSTD_compressStream, with additional control on end directive.
+ *  - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*()
+ *  - Compression parameters cannot be changed once compression is started (save a list of exceptions in multi-threading mode)
+ *  - output->pos must be <= dstCapacity, input->pos must be <= srcSize
+ *  - output->pos and input->pos will be updated. They are guaranteed to remain below their respective limit.
+ *  - endOp must be a valid directive
+ *  - When nbWorkers==0 (default), function is blocking : it completes its job before returning to caller.
+ *  - When nbWorkers>=1, function is non-blocking : it copies a portion of input, distributes jobs to internal worker threads, flush to output whatever is available,
+ *                                                  and then immediately returns, just indicating that there is some data remaining to be flushed.
+ *                                                  The function nonetheless guarantees forward progress : it will return only after it reads or write at least 1+ byte.
+ *  - Exception : if the first call requests a ZSTD_e_end directive and provides enough dstCapacity, the function delegates to ZSTD_compress2() which is always blocking.
+ *  - @return provides a minimum amount of data remaining to be flushed from internal buffers
+ *            or an error code, which can be tested using ZSTD_isError().
+ *            if @return != 0, flush is not fully completed, there is still some data left within internal buffers.
+ *            This is useful for ZSTD_e_flush, since in this case more flushes are necessary to empty all buffers.
+ *            For ZSTD_e_end, @return == 0 when internal buffers are fully flushed and frame is completed.
+ *  - after a ZSTD_e_end directive, if internal buffer is not fully flushed (@return != 0),
+ *            only ZSTD_e_end or ZSTD_e_flush operations are allowed.
+ *            Before starting a new compression job, or changing compression parameters,
+ *            it is required to fully flush internal buffers.
+ */
+ZSTDLIB_API size_t ZSTD_compressStream2( ZSTD_CCtx* cctx,
+                                         ZSTD_outBuffer* output,
+                                         ZSTD_inBuffer* input,
+                                         ZSTD_EndDirective endOp);
+
+
+/* These buffer sizes are softly recommended.
+ * They are not required : ZSTD_compressStream*() happily accepts any buffer size, for both input and output.
+ * Respecting the recommended size just makes it a bit easier for ZSTD_compressStream*(),
+ * reducing the amount of memory shuffling and buffering, resulting in minor performance savings.
+ *
+ * However, note that these recommendations are from the perspective of a C caller program.
+ * If the streaming interface is invoked from some other language,
+ * especially managed ones such as Java or Go, through a foreign function interface such as jni or cgo,
+ * a major performance rule is to reduce crossing such interface to an absolute minimum.
+ * It's not rare that performance ends being spent more into the interface, rather than compression itself.
+ * In which cases, prefer using large buffers, as large as practical,
+ * for both input and output, to reduce the nb of roundtrips.
+ */
+ZSTDLIB_API size_t ZSTD_CStreamInSize(void);    /*< recommended size for input buffer */
+ZSTDLIB_API size_t ZSTD_CStreamOutSize(void);   /*< recommended size for output buffer. Guarantee to successfully flush at least one complete compressed block. */
+
+
+/* *****************************************************************************
+ * This following is a legacy streaming API.
+ * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2().
+ * It is redundant, but remains fully supported.
+ * Advanced parameters and dictionary compression can only be used through the
+ * new API.
+ ******************************************************************************/
+
+/*!
+ * Equivalent to:
  *
- * The default value is all fields set to 0.
+ *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ *     ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any)
+ *     ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel);
+ */
+ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel);
+/*!
+ * Alternative for ZSTD_compressStream2(zcs, output, input, ZSTD_e_continue).
+ * NOTE: The return value is different. ZSTD_compressStream() returns a hint for
+ * the next read size (if non-zero and not an error). ZSTD_compressStream2()
+ * returns the minimum nb of bytes left to flush (if non-zero and not an error).
  */
+ZSTDLIB_API size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
+/*! Equivalent to ZSTD_compressStream2(zcs, output, &emptyInput, ZSTD_e_flush). */
+ZSTDLIB_API size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
+/*! Equivalent to ZSTD_compressStream2(zcs, output, &emptyInput, ZSTD_e_end). */
+ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
+
+
+/*-***************************************************************************
+*  Streaming decompression - HowTo
+*
+*  A ZSTD_DStream object is required to track streaming operations.
+*  Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources.
+*  ZSTD_DStream objects can be re-used multiple times.
+*
+*  Use ZSTD_initDStream() to start a new decompression operation.
+* @return : recommended first input size
+*  Alternatively, use advanced API to set specific properties.
+*
+*  Use ZSTD_decompressStream() repetitively to consume your input.
+*  The function will update both `pos` fields.
+*  If `input.pos < input.size`, some input has not been consumed.
+*  It's up to the caller to present again remaining data.
+*  The function tries to flush all data decoded immediately, respecting output buffer size.
+*  If `output.pos < output.size`, decoder has flushed everything it could.
+*  But if `output.pos == output.size`, there might be some data left within internal buffers.,
+*  In which case, call ZSTD_decompressStream() again to flush whatever remains in the buffer.
+*  Note : with no additional input provided, amount of data flushed is necessarily <= ZSTD_BLOCKSIZE_MAX.
+* @return : 0 when a frame is completely decoded and fully flushed,
+*        or an error code, which can be tested using ZSTD_isError(),
+*        or any other value > 0, which means there is still some decoding or flushing to do to complete current frame :
+*                                the return value is a suggested next input size (just a hint for better latency)
+*                                that will never request more than the remaining frame size.
+* *******************************************************************************/
+
+typedef ZSTD_DCtx ZSTD_DStream;  /*< DCtx and DStream are now effectively same object (>= v1.3.0) */
+                                 /* For compatibility with versions <= v1.2.0, prefer differentiating them. */
+/*===== ZSTD_DStream management functions =====*/
+ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream(void);
+ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds);  /* accept NULL pointer */
+
+/*===== Streaming decompression functions =====*/
+
+/* This function is redundant with the advanced API and equivalent to:
+ *
+ *     ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
+ *     ZSTD_DCtx_refDDict(zds, NULL);
+ */
+ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds);
+
+ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
+
+ZSTDLIB_API size_t ZSTD_DStreamInSize(void);    /*!< recommended size for input buffer */
+ZSTDLIB_API size_t ZSTD_DStreamOutSize(void);   /*!< recommended size for output buffer. Guarantee to successfully flush at least one complete block in all circumstances. */
+
+
+/* ************************
+*  Simple dictionary API
+***************************/
+/*! ZSTD_compress_usingDict() :
+ *  Compression at an explicit compression level using a Dictionary.
+ *  A dictionary can be any arbitrary data segment (also called a prefix),
+ *  or a buffer with specified information (see dictBuilder/zdict.h).
+ *  Note : This function loads the dictionary, resulting in significant startup delay.
+ *         It's intended for a dictionary used only once.
+ *  Note 2 : When `dict == NULL || dictSize < 8` no dictionary is used. */
+ZSTDLIB_API size_t ZSTD_compress_usingDict(ZSTD_CCtx* ctx,
+                                           void* dst, size_t dstCapacity,
+                                     const void* src, size_t srcSize,
+                                     const void* dict,size_t dictSize,
+                                           int compressionLevel);
+
+/*! ZSTD_decompress_usingDict() :
+ *  Decompression using a known Dictionary.
+ *  Dictionary must be identical to the one used during compression.
+ *  Note : This function loads the dictionary, resulting in significant startup delay.
+ *         It's intended for a dictionary used only once.
+ *  Note : When `dict == NULL || dictSize < 8` no dictionary is used. */
+ZSTDLIB_API size_t ZSTD_decompress_usingDict(ZSTD_DCtx* dctx,
+                                             void* dst, size_t dstCapacity,
+                                       const void* src, size_t srcSize,
+                                       const void* dict,size_t dictSize);
+
+
+/* *********************************
+ *  Bulk processing dictionary API
+ **********************************/
+typedef struct ZSTD_CDict_s ZSTD_CDict;
+
+/*! ZSTD_createCDict() :
+ *  When compressing multiple messages or blocks using the same dictionary,
+ *  it's recommended to digest the dictionary only once, since it's a costly operation.
+ *  ZSTD_createCDict() will create a state from digesting a dictionary.
+ *  The resulting state can be used for future compression operations with very limited startup cost.
+ *  ZSTD_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only.
+ * @dictBuffer can be released after ZSTD_CDict creation, because its content is copied within CDict.
+ *  Note 1 : Consider experimental function `ZSTD_createCDict_byReference()` if you prefer to not duplicate @dictBuffer content.
+ *  Note 2 : A ZSTD_CDict can be created from an empty @dictBuffer,
+ *      in which case the only thing that it transports is the @compressionLevel.
+ *      This can be useful in a pipeline featuring ZSTD_compress_usingCDict() exclusively,
+ *      expecting a ZSTD_CDict parameter with any data, including those without a known dictionary. */
+ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict(const void* dictBuffer, size_t dictSize,
+                                         int compressionLevel);
+
+/*! ZSTD_freeCDict() :
+ *  Function frees memory allocated by ZSTD_createCDict().
+ *  If a NULL pointer is passed, no operation is performed. */
+ZSTDLIB_API size_t      ZSTD_freeCDict(ZSTD_CDict* CDict);
+
+/*! ZSTD_compress_usingCDict() :
+ *  Compression using a digested Dictionary.
+ *  Recommended when same dictionary is used multiple times.
+ *  Note : compression level is _decided at dictionary creation time_,
+ *     and frame parameters are hardcoded (dictID=yes, contentSize=yes, checksum=no) */
+ZSTDLIB_API size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx,
+                                            void* dst, size_t dstCapacity,
+                                      const void* src, size_t srcSize,
+                                      const ZSTD_CDict* cdict);
+
+
+typedef struct ZSTD_DDict_s ZSTD_DDict;
+
+/*! ZSTD_createDDict() :
+ *  Create a digested dictionary, ready to start decompression operation without startup delay.
+ *  dictBuffer can be released after DDict creation, as its content is copied inside DDict. */
+ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict(const void* dictBuffer, size_t dictSize);
+
+/*! ZSTD_freeDDict() :
+ *  Function frees memory allocated with ZSTD_createDDict()
+ *  If a NULL pointer is passed, no operation is performed. */
+ZSTDLIB_API size_t      ZSTD_freeDDict(ZSTD_DDict* ddict);
+
+/*! ZSTD_decompress_usingDDict() :
+ *  Decompression using a digested Dictionary.
+ *  Recommended when same dictionary is used multiple times. */
+ZSTDLIB_API size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx,
+                                              void* dst, size_t dstCapacity,
+                                        const void* src, size_t srcSize,
+                                        const ZSTD_DDict* ddict);
+
+
+/* ******************************
+ *  Dictionary helper functions
+ *******************************/
+
+/*! ZSTD_getDictID_fromDict() :
+ *  Provides the dictID stored within dictionary.
+ *  if @return == 0, the dictionary is not conformant with Zstandard specification.
+ *  It can still be loaded, but as a content-only dictionary. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize);
+
+/*! ZSTD_getDictID_fromDDict() :
+ *  Provides the dictID of the dictionary loaded into `ddict`.
+ *  If @return == 0, the dictionary is not conformant to Zstandard specification, or empty.
+ *  Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict);
+
+/*! ZSTD_getDictID_fromFrame() :
+ *  Provides the dictID required to decompressed the frame stored within `src`.
+ *  If @return == 0, the dictID could not be decoded.
+ *  This could for one of the following reasons :
+ *  - The frame does not require a dictionary to be decoded (most common case).
+ *  - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden information.
+ *    Note : this use case also happens when using a non-conformant dictionary.
+ *  - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`).
+ *  - This is not a Zstandard frame.
+ *  When identifying the exact failure cause, it's possible to use ZSTD_getFrameHeader(), which will provide a more precise error code. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
+
+
+/* *****************************************************************************
+ * Advanced dictionary and prefix API
+ *
+ * This API allows dictionaries to be used with ZSTD_compress2(),
+ * ZSTD_compressStream2(), and ZSTD_decompress(). Dictionaries are sticky, and
+ * only reset with the context is reset with ZSTD_reset_parameters or
+ * ZSTD_reset_session_and_parameters. Prefixes are single-use.
+ ******************************************************************************/
+
+
+/*! ZSTD_CCtx_loadDictionary() :
+ *  Create an internal CDict from `dict` buffer.
+ *  Decompression will have to use same dictionary.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary,
+ *           meaning "return to no-dictionary mode".
+ *  Note 1 : Dictionary is sticky, it will be used for all future compressed frames.
+ *           To return to "no-dictionary" situation, load a NULL dictionary (or reset parameters).
+ *  Note 2 : Loading a dictionary involves building tables.
+ *           It's also a CPU consuming operation, with non-negligible impact on latency.
+ *           Tables are dependent on compression parameters, and for this reason,
+ *           compression parameters can no longer be changed after loading a dictionary.
+ *  Note 3 :`dict` content will be copied internally.
+ *           Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead.
+ *           In such a case, dictionary buffer must outlive its users.
+ *  Note 4 : Use ZSTD_CCtx_loadDictionary_advanced()
+ *           to precisely select how dictionary content must be interpreted. */
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize);
+
+/*! ZSTD_CCtx_refCDict() :
+ *  Reference a prepared dictionary, to be used for all next compressed frames.
+ *  Note that compression parameters are enforced from within CDict,
+ *  and supersede any compression parameter previously set within CCtx.
+ *  The parameters ignored are labelled as "superseded-by-cdict" in the ZSTD_cParameter enum docs.
+ *  The ignored parameters will be used again if the CCtx is returned to no-dictionary mode.
+ *  The dictionary will remain valid for future compressed frames using same CCtx.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Special : Referencing a NULL CDict means "return to no-dictionary mode".
+ *  Note 1 : Currently, only one dictionary can be managed.
+ *           Referencing a new dictionary effectively "discards" any previous one.
+ *  Note 2 : CDict is just referenced, its lifetime must outlive its usage within CCtx. */
+ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict);
+
+/*! ZSTD_CCtx_refPrefix() :
+ *  Reference a prefix (single-usage dictionary) for next compressed frame.
+ *  A prefix is **only used once**. Tables are discarded at end of frame (ZSTD_e_end).
+ *  Decompression will need same prefix to properly regenerate data.
+ *  Compressing with a prefix is similar in outcome as performing a diff and compressing it,
+ *  but performs much faster, especially during decompression (compression speed is tunable with compression level).
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary
+ *  Note 1 : Prefix buffer is referenced. It **must** outlive compression.
+ *           Its content must remain unmodified during compression.
+ *  Note 2 : If the intention is to diff some large src data blob with some prior version of itself,
+ *           ensure that the window size is large enough to contain the entire source.
+ *           See ZSTD_c_windowLog.
+ *  Note 3 : Referencing a prefix involves building tables, which are dependent on compression parameters.
+ *           It's a CPU consuming operation, with non-negligible impact on latency.
+ *           If there is a need to use the same prefix multiple times, consider loadDictionary instead.
+ *  Note 4 : By default, the prefix is interpreted as raw content (ZSTD_dct_rawContent).
+ *           Use experimental ZSTD_CCtx_refPrefix_advanced() to alter dictionary interpretation. */
+ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx,
+                                 const void* prefix, size_t prefixSize);
+
+/*! ZSTD_DCtx_loadDictionary() :
+ *  Create an internal DDict from dict buffer,
+ *  to be used to decompress next frames.
+ *  The dictionary remains valid for all future frames, until explicitly invalidated.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary,
+ *            meaning "return to no-dictionary mode".
+ *  Note 1 : Loading a dictionary involves building tables,
+ *           which has a non-negligible impact on CPU usage and latency.
+ *           It's recommended to "load once, use many times", to amortize the cost
+ *  Note 2 :`dict` content will be copied internally, so `dict` can be released after loading.
+ *           Use ZSTD_DCtx_loadDictionary_byReference() to reference dictionary content instead.
+ *  Note 3 : Use ZSTD_DCtx_loadDictionary_advanced() to take control of
+ *           how dictionary content is loaded and interpreted.
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+
+/*! ZSTD_DCtx_refDDict() :
+ *  Reference a prepared dictionary, to be used to decompress next frames.
+ *  The dictionary remains active for decompression of future frames using same DCtx.
+ *
+ *  If called with ZSTD_d_refMultipleDDicts enabled, repeated calls of this function
+ *  will store the DDict references in a table, and the DDict used for decompression
+ *  will be determined at decompression time, as per the dict ID in the frame.
+ *  The memory for the table is allocated on the first call to refDDict, and can be
+ *  freed with ZSTD_freeDCtx().
+ *
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Note 1 : Currently, only one dictionary can be managed.
+ *           Referencing a new dictionary effectively "discards" any previous one.
+ *  Special: referencing a NULL DDict means "return to no-dictionary mode".
+ *  Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx.
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict);
+
+/*! ZSTD_DCtx_refPrefix() :
+ *  Reference a prefix (single-usage dictionary) to decompress next frame.
+ *  This is the reverse operation of ZSTD_CCtx_refPrefix(),
+ *  and must use the same prefix as the one used during compression.
+ *  Prefix is **only used once**. Reference is discarded at end of frame.
+ *  End of frame is reached when ZSTD_decompressStream() returns 0.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Note 1 : Adding any prefix (including NULL) invalidates any previously set prefix or dictionary
+ *  Note 2 : Prefix buffer is referenced. It **must** outlive decompression.
+ *           Prefix buffer must remain unmodified up to the end of frame,
+ *           reached when ZSTD_decompressStream() returns 0.
+ *  Note 3 : By default, the prefix is treated as raw content (ZSTD_dct_rawContent).
+ *           Use ZSTD_CCtx_refPrefix_advanced() to alter dictMode (Experimental section)
+ *  Note 4 : Referencing a raw content prefix has almost no cpu nor memory cost.
+ *           A full dictionary is more costly, as it requires building tables.
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx,
+                                 const void* prefix, size_t prefixSize);
+
+/* ===   Memory management   === */
+
+/*! ZSTD_sizeof_*() :
+ *  These functions give the _current_ memory usage of selected object.
+ *  Note that object memory usage can evolve (increase or decrease) over time. */
+ZSTDLIB_API size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx);
+ZSTDLIB_API size_t ZSTD_sizeof_DCtx(const ZSTD_DCtx* dctx);
+ZSTDLIB_API size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs);
+ZSTDLIB_API size_t ZSTD_sizeof_DStream(const ZSTD_DStream* zds);
+ZSTDLIB_API size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict);
+ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
+
+#endif  /* ZSTD_H_235446 */
+
+
+/* **************************************************************************************
+ *   ADVANCED AND EXPERIMENTAL FUNCTIONS
+ ****************************************************************************************
+ * The definitions in the following section are considered experimental.
+ * They are provided for advanced scenarios.
+ * They should never be used with a dynamic library, as prototypes may change in the future.
+ * Use them only in association with static linking.
+ * ***************************************************************************************/
+
+#if !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY)
+#define ZSTD_H_ZSTD_STATIC_LINKING_ONLY
+
+/* **************************************************************************************
+ *   experimental API (static linking only)
+ ****************************************************************************************
+ * The following symbols and constants
+ * are not planned to join "stable API" status in the near future.
+ * They can still change in future versions.
+ * Some of them are planned to remain in the static_only section indefinitely.
+ * Some of them might be removed in the future (especially when redundant with existing stable functions)
+ * ***************************************************************************************/
+
+#define ZSTD_FRAMEHEADERSIZE_PREFIX(format) ((format) == ZSTD_f_zstd1 ? 5 : 1)   /* minimum input size required to query frame header size */
+#define ZSTD_FRAMEHEADERSIZE_MIN(format)    ((format) == ZSTD_f_zstd1 ? 6 : 2)
+#define ZSTD_FRAMEHEADERSIZE_MAX   18   /* can be useful for static allocation */
+#define ZSTD_SKIPPABLEHEADERSIZE    8
+
+/* compression parameter bounds */
+#define ZSTD_WINDOWLOG_MAX_32    30
+#define ZSTD_WINDOWLOG_MAX_64    31
+#define ZSTD_WINDOWLOG_MAX     ((int)(sizeof(size_t) == 4 ? ZSTD_WINDOWLOG_MAX_32 : ZSTD_WINDOWLOG_MAX_64))
+#define ZSTD_WINDOWLOG_MIN       10
+#define ZSTD_HASHLOG_MAX       ((ZSTD_WINDOWLOG_MAX < 30) ? ZSTD_WINDOWLOG_MAX : 30)
+#define ZSTD_HASHLOG_MIN          6
+#define ZSTD_CHAINLOG_MAX_32     29
+#define ZSTD_CHAINLOG_MAX_64     30
+#define ZSTD_CHAINLOG_MAX      ((int)(sizeof(size_t) == 4 ? ZSTD_CHAINLOG_MAX_32 : ZSTD_CHAINLOG_MAX_64))
+#define ZSTD_CHAINLOG_MIN        ZSTD_HASHLOG_MIN
+#define ZSTD_SEARCHLOG_MAX      (ZSTD_WINDOWLOG_MAX-1)
+#define ZSTD_SEARCHLOG_MIN        1
+#define ZSTD_MINMATCH_MAX         7   /* only for ZSTD_fast, other strategies are limited to 6 */
+#define ZSTD_MINMATCH_MIN         3   /* only for ZSTD_btopt+, faster strategies are limited to 4 */
+#define ZSTD_TARGETLENGTH_MAX    ZSTD_BLOCKSIZE_MAX
+#define ZSTD_TARGETLENGTH_MIN     0   /* note : comparing this constant to an unsigned results in a tautological test */
+#define ZSTD_STRATEGY_MIN        ZSTD_fast
+#define ZSTD_STRATEGY_MAX        ZSTD_btultra2
+
+
+#define ZSTD_OVERLAPLOG_MIN       0
+#define ZSTD_OVERLAPLOG_MAX       9
+
+#define ZSTD_WINDOWLOG_LIMIT_DEFAULT 27   /* by default, the streaming decoder will refuse any frame
+                                           * requiring larger than (1<<ZSTD_WINDOWLOG_LIMIT_DEFAULT) window size,
+                                           * to preserve host's memory from unreasonable requirements.
+                                           * This limit can be overridden using ZSTD_DCtx_setParameter(,ZSTD_d_windowLogMax,).
+                                           * The limit does not apply for one-pass decoders (such as ZSTD_decompress()), since no additional memory is allocated */
+
+
+/* LDM parameter bounds */
+#define ZSTD_LDM_HASHLOG_MIN      ZSTD_HASHLOG_MIN
+#define ZSTD_LDM_HASHLOG_MAX      ZSTD_HASHLOG_MAX
+#define ZSTD_LDM_MINMATCH_MIN        4
+#define ZSTD_LDM_MINMATCH_MAX     4096
+#define ZSTD_LDM_BUCKETSIZELOG_MIN   1
+#define ZSTD_LDM_BUCKETSIZELOG_MAX   8
+#define ZSTD_LDM_HASHRATELOG_MIN     0
+#define ZSTD_LDM_HASHRATELOG_MAX (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN)
+
+/* Advanced parameter bounds */
+#define ZSTD_TARGETCBLOCKSIZE_MIN   64
+#define ZSTD_TARGETCBLOCKSIZE_MAX   ZSTD_BLOCKSIZE_MAX
+#define ZSTD_SRCSIZEHINT_MIN        0
+#define ZSTD_SRCSIZEHINT_MAX        INT_MAX
+
+/* internal */
+#define ZSTD_HASHLOG3_MAX           17
+
+
+/* ---  Advanced types  --- */
+
+typedef struct ZSTD_CCtx_params_s ZSTD_CCtx_params;
+
 typedef struct {
-	unsigned int contentSizeFlag;
-	unsigned int checksumFlag;
-	unsigned int noDictIDFlag;
+    unsigned int offset;      /* The offset of the match. (NOT the same as the offset code)
+                               * If offset == 0 and matchLength == 0, this sequence represents the last
+                               * literals in the block of litLength size.
+                               */
+
+    unsigned int litLength;   /* Literal length of the sequence. */
+    unsigned int matchLength; /* Match length of the sequence. */
+
+                              /* Note: Users of this API may provide a sequence with matchLength == litLength == offset == 0.
+                               * In this case, we will treat the sequence as a marker for a block boundary.
+                               */
+
+    unsigned int rep;         /* Represents which repeat offset is represented by the field 'offset'.
+                               * Ranges from [0, 3].
+                               *
+                               * Repeat offsets are essentially previous offsets from previous sequences sorted in
+                               * recency order. For more detail, see doc/zstd_compression_format.md
+                               *
+                               * If rep == 0, then 'offset' does not contain a repeat offset.
+                               * If rep > 0:
+                               *  If litLength != 0:
+                               *      rep == 1 --> offset == repeat_offset_1
+                               *      rep == 2 --> offset == repeat_offset_2
+                               *      rep == 3 --> offset == repeat_offset_3
+                               *  If litLength == 0:
+                               *      rep == 1 --> offset == repeat_offset_2
+                               *      rep == 2 --> offset == repeat_offset_3
+                               *      rep == 3 --> offset == repeat_offset_1 - 1
+                               *
+                               * Note: This field is optional. ZSTD_generateSequences() will calculate the value of
+                               * 'rep', but repeat offsets do not necessarily need to be calculated from an external
+                               * sequence provider's perspective. For example, ZSTD_compressSequences() does not
+                               * use this 'rep' field at all (as of now).
+                               */
+} ZSTD_Sequence;
+
+typedef struct {
+    unsigned windowLog;       /*< largest match distance : larger == more compression, more memory needed during decompression */
+    unsigned chainLog;        /*< fully searched segment : larger == more compression, slower, more memory (useless for fast) */
+    unsigned hashLog;         /*< dispatch table : larger == faster, more memory */
+    unsigned searchLog;       /*< nb of searches : larger == more compression, slower */
+    unsigned minMatch;        /*< match length searched : larger == faster decompression, sometimes less compression */
+    unsigned targetLength;    /*< acceptable match size for optimal parser (only) : larger == more compression, slower */
+    ZSTD_strategy strategy;   /*< see ZSTD_strategy definition above */
+} ZSTD_compressionParameters;
+
+typedef struct {
+    int contentSizeFlag; /*< 1: content size will be in frame header (when known) */
+    int checksumFlag;    /*< 1: generate a 32-bits checksum using XXH64 algorithm at end of frame, for error detection */
+    int noDictIDFlag;    /*< 1: no dictID will be saved into frame header (dictID is only useful for dictionary compression) */
 } ZSTD_frameParameters;
 
-/**
- * struct ZSTD_parameters - zstd parameters
- * @cParams: The compression parameters.
- * @fParams: The frame parameters.
- */
 typedef struct {
-	ZSTD_compressionParameters cParams;
-	ZSTD_frameParameters fParams;
+    ZSTD_compressionParameters cParams;
+    ZSTD_frameParameters fParams;
 } ZSTD_parameters;
 
-/**
- * ZSTD_getCParams() - returns ZSTD_compressionParameters for selected level
- * @compressionLevel: The compression level from 1 to ZSTD_maxCLevel().
- * @estimatedSrcSize: The estimated source size to compress or 0 if unknown.
- * @dictSize:         The dictionary size or 0 if a dictionary isn't being used.
- *
- * Return:            The selected ZSTD_compressionParameters.
+typedef enum {
+    ZSTD_dct_auto = 0,       /* dictionary is "full" when starting with ZSTD_MAGIC_DICTIONARY, otherwise it is "rawContent" */
+    ZSTD_dct_rawContent = 1, /* ensures dictionary is always loaded as rawContent, even if it starts with ZSTD_MAGIC_DICTIONARY */
+    ZSTD_dct_fullDict = 2    /* refuses to load a dictionary if it does not respect Zstandard's specification, starting with ZSTD_MAGIC_DICTIONARY */
+} ZSTD_dictContentType_e;
+
+typedef enum {
+    ZSTD_dlm_byCopy = 0,  /*< Copy dictionary content internally */
+    ZSTD_dlm_byRef = 1    /*< Reference dictionary content -- the dictionary buffer must outlive its users. */
+} ZSTD_dictLoadMethod_e;
+
+typedef enum {
+    ZSTD_f_zstd1 = 0,           /* zstd frame format, specified in zstd_compression_format.md (default) */
+    ZSTD_f_zstd1_magicless = 1  /* Variant of zstd frame format, without initial 4-bytes magic number.
+                                 * Useful to save 4 bytes per generated frame.
+                                 * Decoder cannot recognise automatically this format, requiring this instruction. */
+} ZSTD_format_e;
+
+typedef enum {
+    /* Note: this enum controls ZSTD_d_forceIgnoreChecksum */
+    ZSTD_d_validateChecksum = 0,
+    ZSTD_d_ignoreChecksum = 1
+} ZSTD_forceIgnoreChecksum_e;
+
+typedef enum {
+    /* Note: this enum controls ZSTD_d_refMultipleDDicts */
+    ZSTD_rmd_refSingleDDict = 0,
+    ZSTD_rmd_refMultipleDDicts = 1
+} ZSTD_refMultipleDDicts_e;
+
+typedef enum {
+    /* Note: this enum and the behavior it controls are effectively internal
+     * implementation details of the compressor. They are expected to continue
+     * to evolve and should be considered only in the context of extremely
+     * advanced performance tuning.
+     *
+     * Zstd currently supports the use of a CDict in three ways:
+     *
+     * - The contents of the CDict can be copied into the working context. This
+     *   means that the compression can search both the dictionary and input
+     *   while operating on a single set of internal tables. This makes
+     *   the compression faster per-byte of input. However, the initial copy of
+     *   the CDict's tables incurs a fixed cost at the beginning of the
+     *   compression. For small compressions (< 8 KB), that copy can dominate
+     *   the cost of the compression.
+     *
+     * - The CDict's tables can be used in-place. In this model, compression is
+     *   slower per input byte, because the compressor has to search two sets of
+     *   tables. However, this model incurs no start-up cost (as long as the
+     *   working context's tables can be reused). For small inputs, this can be
+     *   faster than copying the CDict's tables.
+     *
+     * - The CDict's tables are not used at all, and instead we use the working
+     *   context alone to reload the dictionary and use params based on the source
+     *   size. See ZSTD_compress_insertDictionary() and ZSTD_compress_usingDict().
+     *   This method is effective when the dictionary sizes are very small relative
+     *   to the input size, and the input size is fairly large to begin with.
+     *
+     * Zstd has a simple internal heuristic that selects which strategy to use
+     * at the beginning of a compression. However, if experimentation shows that
+     * Zstd is making poor choices, it is possible to override that choice with
+     * this enum.
+     */
+    ZSTD_dictDefaultAttach = 0, /* Use the default heuristic. */
+    ZSTD_dictForceAttach   = 1, /* Never copy the dictionary. */
+    ZSTD_dictForceCopy     = 2, /* Always copy the dictionary. */
+    ZSTD_dictForceLoad     = 3  /* Always reload the dictionary */
+} ZSTD_dictAttachPref_e;
+
+typedef enum {
+  ZSTD_lcm_auto = 0,          /*< Automatically determine the compression mode based on the compression level.
+                               *   Negative compression levels will be uncompressed, and positive compression
+                               *   levels will be compressed. */
+  ZSTD_lcm_huffman = 1,       /*< Always attempt Huffman compression. Uncompressed literals will still be
+                               *   emitted if Huffman compression is not profitable. */
+  ZSTD_lcm_uncompressed = 2   /*< Always emit uncompressed literals. */
+} ZSTD_literalCompressionMode_e;
+
+
+/* *************************************
+*  Frame size functions
+***************************************/
+
+/*! ZSTD_findDecompressedSize() :
+ *  `src` should point to the start of a series of ZSTD encoded and/or skippable frames
+ *  `srcSize` must be the _exact_ size of this series
+ *       (i.e. there should be a frame boundary at `src + srcSize`)
+ *  @return : - decompressed size of all data in all successive frames
+ *            - if the decompressed size cannot be determined: ZSTD_CONTENTSIZE_UNKNOWN
+ *            - if an error occurred: ZSTD_CONTENTSIZE_ERROR
+ *
+ *   note 1 : decompressed size is an optional field, that may not be present, especially in streaming mode.
+ *            When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size.
+ *            In which case, it's necessary to use streaming mode to decompress data.
+ *   note 2 : decompressed size is always present when compression is done with ZSTD_compress()
+ *   note 3 : decompressed size can be very large (64-bits value),
+ *            potentially larger than what local system can handle as a single memory segment.
+ *            In which case, it's necessary to use streaming mode to decompress data.
+ *   note 4 : If source is untrusted, decompressed size could be wrong or intentionally modified.
+ *            Always ensure result fits within application's authorized limits.
+ *            Each application can set its own limits.
+ *   note 5 : ZSTD_findDecompressedSize handles multiple frames, and so it must traverse the input to
+ *            read each contained frame header.  This is fast as most of the data is skipped,
+ *            however it does mean that all frame data must be present and valid. */
+ZSTDLIB_API unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize);
+
+/*! ZSTD_decompressBound() :
+ *  `src` should point to the start of a series of ZSTD encoded and/or skippable frames
+ *  `srcSize` must be the _exact_ size of this series
+ *       (i.e. there should be a frame boundary at `src + srcSize`)
+ *  @return : - upper-bound for the decompressed size of all data in all successive frames
+ *            - if an error occurred: ZSTD_CONTENTSIZE_ERROR
+ *
+ *  note 1  : an error can occur if `src` contains an invalid or incorrectly formatted frame.
+ *  note 2  : the upper-bound is exact when the decompressed size field is available in every ZSTD encoded frame of `src`.
+ *            in this case, `ZSTD_findDecompressedSize` and `ZSTD_decompressBound` return the same value.
+ *  note 3  : when the decompressed size field isn't available, the upper-bound for that frame is calculated by:
+ *              upper-bound = # blocks * min(128 KB, Window_Size)
  */
-ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel,
-	unsigned long long estimatedSrcSize, size_t dictSize);
+ZSTDLIB_API unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize);
 
-/**
- * ZSTD_getParams() - returns ZSTD_parameters for selected level
- * @compressionLevel: The compression level from 1 to ZSTD_maxCLevel().
- * @estimatedSrcSize: The estimated source size to compress or 0 if unknown.
- * @dictSize:         The dictionary size or 0 if a dictionary isn't being used.
+/*! ZSTD_frameHeaderSize() :
+ *  srcSize must be >= ZSTD_FRAMEHEADERSIZE_PREFIX.
+ * @return : size of the Frame Header,
+ *           or an error code (if srcSize is too small) */
+ZSTDLIB_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize);
+
+typedef enum {
+  ZSTD_sf_noBlockDelimiters = 0,         /* Representation of ZSTD_Sequence has no block delimiters, sequences only */
+  ZSTD_sf_explicitBlockDelimiters = 1    /* Representation of ZSTD_Sequence contains explicit block delimiters */
+} ZSTD_sequenceFormat_e;
+
+/*! ZSTD_generateSequences() :
+ * Generate sequences using ZSTD_compress2, given a source buffer.
+ *
+ * Each block will end with a dummy sequence
+ * with offset == 0, matchLength == 0, and litLength == length of last literals.
+ * litLength may be == 0, and if so, then the sequence of (of: 0 ml: 0 ll: 0)
+ * simply acts as a block delimiter.
  *
- * The same as ZSTD_getCParams() except also selects the default frame
- * parameters (all zero).
+ * zc can be used to insert custom compression params.
+ * This function invokes ZSTD_compress2
  *
- * Return:            The selected ZSTD_parameters.
+ * The output of this function can be fed into ZSTD_compressSequences() with CCtx
+ * setting of ZSTD_c_blockDelimiters as ZSTD_sf_explicitBlockDelimiters
+ * @return : number of sequences generated
  */
-ZSTD_parameters ZSTD_getParams(int compressionLevel,
-	unsigned long long estimatedSrcSize, size_t dictSize);
 
-/*-*************************************
- * Explicit memory management
- **************************************/
+ZSTDLIB_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
+                                          size_t outSeqsSize, const void* src, size_t srcSize);
 
-/**
- * ZSTD_CCtxWorkspaceBound() - amount of memory needed to initialize a ZSTD_CCtx
- * @cParams: The compression parameters to be used for compression.
+/*! ZSTD_mergeBlockDelimiters() :
+ * Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals
+ * by merging them into into the literals of the next sequence.
  *
- * If multiple compression parameters might be used, the caller must call
- * ZSTD_CCtxWorkspaceBound() for each set of parameters and use the maximum
- * size.
+ * As such, the final generated result has no explicit representation of block boundaries,
+ * and the final last literals segment is not represented in the sequences.
  *
- * Return:   A lower bound on the size of the workspace that is passed to
- *           ZSTD_initCCtx().
+ * The output of this function can be fed into ZSTD_compressSequences() with CCtx
+ * setting of ZSTD_c_blockDelimiters as ZSTD_sf_noBlockDelimiters
+ * @return : number of sequences left after merging
+ */
+ZSTDLIB_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize);
+
+/*! ZSTD_compressSequences() :
+ * Compress an array of ZSTD_Sequence, generated from the original source buffer, into dst.
+ * If a dictionary is included, then the cctx should reference the dict. (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.)
+ * The entire source is compressed into a single frame.
+ *
+ * The compression behavior changes based on cctx params. In particular:
+ *    If ZSTD_c_blockDelimiters == ZSTD_sf_noBlockDelimiters, the array of ZSTD_Sequence is expected to contain
+ *    no block delimiters (defined in ZSTD_Sequence). Block boundaries are roughly determined based on
+ *    the block size derived from the cctx, and sequences may be split. This is the default setting.
+ *
+ *    If ZSTD_c_blockDelimiters == ZSTD_sf_explicitBlockDelimiters, the array of ZSTD_Sequence is expected to contain
+ *    block delimiters (defined in ZSTD_Sequence). Behavior is undefined if no block delimiters are provided.
+ *
+ *    If ZSTD_c_validateSequences == 0, this function will blindly accept the sequences provided. Invalid sequences cause undefined
+ *    behavior. If ZSTD_c_validateSequences == 1, then if sequence is invalid (see doc/zstd_compression_format.md for
+ *    specifics regarding offset/matchlength requirements) then the function will bail out and return an error.
+ *
+ *    In addition to the two adjustable experimental params, there are other important cctx params.
+ *    - ZSTD_c_minMatch MUST be set as less than or equal to the smallest match generated by the match finder. It has a minimum value of ZSTD_MINMATCH_MIN.
+ *    - ZSTD_c_compressionLevel accordingly adjusts the strength of the entropy coder, as it would in typical compression.
+ *    - ZSTD_c_windowLog affects offset validation: this function will return an error at higher debug levels if a provided offset
+ *      is larger than what the spec allows for a given window log and dictionary (if present). See: doc/zstd_compression_format.md
+ *
+ * Note: Repcodes are, as of now, always re-calculated within this function, so ZSTD_Sequence::rep is unused.
+ * Note 2: Once we integrate ability to ingest repcodes, the explicit block delims mode must respect those repcodes exactly,
+ *         and cannot emit an RLE block that disagrees with the repcode history
+ * @return : final compressed size or a ZSTD error.
  */
-size_t ZSTD_CCtxWorkspaceBound(ZSTD_compressionParameters cParams);
+ZSTDLIB_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstSize,
+                                  const ZSTD_Sequence* inSeqs, size_t inSeqsSize,
+                                  const void* src, size_t srcSize);
 
-/**
- * struct ZSTD_CCtx - the zstd compression context
+
+/*! ZSTD_writeSkippableFrame() :
+ * Generates a zstd skippable frame containing data given by src, and writes it to dst buffer.
  *
- * When compressing many times it is recommended to allocate a context just once
- * and reuse it for each successive compression operation.
- */
-typedef struct ZSTD_CCtx_s ZSTD_CCtx;
-/**
- * ZSTD_initCCtx() - initialize a zstd compression context
- * @workspace:     The workspace to emplace the context into. It must outlive
- *                 the returned context.
- * @workspaceSize: The size of workspace. Use ZSTD_CCtxWorkspaceBound() to
- *                 determine how large the workspace must be.
+ * Skippable frames begin with a a 4-byte magic number. There are 16 possible choices of magic number,
+ * ranging from ZSTD_MAGIC_SKIPPABLE_START to ZSTD_MAGIC_SKIPPABLE_START+15.
+ * As such, the parameter magicVariant controls the exact skippable frame magic number variant used, so
+ * the magic number used will be ZSTD_MAGIC_SKIPPABLE_START + magicVariant.
  *
- * Return:         A compression context emplaced into workspace.
- */
-ZSTD_CCtx *ZSTD_initCCtx(void *workspace, size_t workspaceSize);
-
-/**
- * ZSTD_compressCCtx() - compress src into dst
- * @ctx:         The context. Must have been initialized with a workspace at
- *               least as large as ZSTD_CCtxWorkspaceBound(params.cParams).
- * @dst:         The buffer to compress src into.
- * @dstCapacity: The size of the destination buffer. May be any size, but
- *               ZSTD_compressBound(srcSize) is guaranteed to be large enough.
- * @src:         The data to compress.
- * @srcSize:     The size of the data to compress.
- * @params:      The parameters to use for compression. See ZSTD_getParams().
+ * Returns an error if destination buffer is not large enough, if the source size is not representable
+ * with a 4-byte unsigned int, or if the parameter magicVariant is greater than 15 (and therefore invalid).
  *
- * Return:       The compressed size or an error, which can be checked using
- *               ZSTD_isError().
+ * @return : number of bytes written or a ZSTD error.
  */
-size_t ZSTD_compressCCtx(ZSTD_CCtx *ctx, void *dst, size_t dstCapacity,
-	const void *src, size_t srcSize, ZSTD_parameters params);
-
-/**
- * ZSTD_DCtxWorkspaceBound() - amount of memory needed to initialize a ZSTD_DCtx
- *
- * Return: A lower bound on the size of the workspace that is passed to
- *         ZSTD_initDCtx().
+ZSTDLIB_API size_t ZSTD_writeSkippableFrame(void* dst, size_t dstCapacity,
+                                            const void* src, size_t srcSize, unsigned magicVariant);
+
+
+/* *************************************
+*  Memory management
+***************************************/
+
+/*! ZSTD_estimate*() :
+ *  These functions make it possible to estimate memory usage
+ *  of a future {D,C}Ctx, before its creation.
+ *
+ *  ZSTD_estimateCCtxSize() will provide a memory budget large enough
+ *  for any compression level up to selected one.
+ *  Note : Unlike ZSTD_estimateCStreamSize*(), this estimate
+ *         does not include space for a window buffer.
+ *         Therefore, the estimation is only guaranteed for single-shot compressions, not streaming.
+ *  The estimate will assume the input may be arbitrarily large,
+ *  which is the worst case.
+ *
+ *  When srcSize can be bound by a known and rather "small" value,
+ *  this fact can be used to provide a tighter estimation
+ *  because the CCtx compression context will need less memory.
+ *  This tighter estimation can be provided by more advanced functions
+ *  ZSTD_estimateCCtxSize_usingCParams(), which can be used in tandem with ZSTD_getCParams(),
+ *  and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter().
+ *  Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits.
+ *
+ *  Note 2 : only single-threaded compression is supported.
+ *  ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1.
+ */
+ZSTDLIB_API size_t ZSTD_estimateCCtxSize(int compressionLevel);
+ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams);
+ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params);
+ZSTDLIB_API size_t ZSTD_estimateDCtxSize(void);
+
+/*! ZSTD_estimateCStreamSize() :
+ *  ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one.
+ *  It will also consider src size to be arbitrarily "large", which is worst case.
+ *  If srcSize is known to always be small, ZSTD_estimateCStreamSize_usingCParams() can provide a tighter estimation.
+ *  ZSTD_estimateCStreamSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel.
+ *  ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParams_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_c_nbWorkers is >= 1.
+ *  Note : CStream size estimation is only correct for single-threaded compression.
+ *  ZSTD_DStream memory budget depends on window Size.
+ *  This information can be passed manually, using ZSTD_estimateDStreamSize,
+ *  or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame();
+ *  Note : if streaming is init with function ZSTD_init?Stream_usingDict(),
+ *         an internal ?Dict will be created, which additional size is not estimated here.
+ *         In this case, get total size by adding ZSTD_estimate?DictSize */
+ZSTDLIB_API size_t ZSTD_estimateCStreamSize(int compressionLevel);
+ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams);
+ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params);
+ZSTDLIB_API size_t ZSTD_estimateDStreamSize(size_t windowSize);
+ZSTDLIB_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize);
+
+/*! ZSTD_estimate?DictSize() :
+ *  ZSTD_estimateCDictSize() will bet that src size is relatively "small", and content is copied, like ZSTD_createCDict().
+ *  ZSTD_estimateCDictSize_advanced() makes it possible to control compression parameters precisely, like ZSTD_createCDict_advanced().
+ *  Note : dictionaries created by reference (`ZSTD_dlm_byRef`) are logically smaller.
+ */
+ZSTDLIB_API size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel);
+ZSTDLIB_API size_t ZSTD_estimateCDictSize_advanced(size_t dictSize, ZSTD_compressionParameters cParams, ZSTD_dictLoadMethod_e dictLoadMethod);
+ZSTDLIB_API size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod);
+
+/*! ZSTD_initStatic*() :
+ *  Initialize an object using a pre-allocated fixed-size buffer.
+ *  workspace: The memory area to emplace the object into.
+ *             Provided pointer *must be 8-bytes aligned*.
+ *             Buffer must outlive object.
+ *  workspaceSize: Use ZSTD_estimate*Size() to determine
+ *                 how large workspace must be to support target scenario.
+ * @return : pointer to object (same address as workspace, just different type),
+ *           or NULL if error (size too small, incorrect alignment, etc.)
+ *  Note : zstd will never resize nor malloc() when using a static buffer.
+ *         If the object requires more memory than available,
+ *         zstd will just error out (typically ZSTD_error_memory_allocation).
+ *  Note 2 : there is no corresponding "free" function.
+ *           Since workspace is allocated externally, it must be freed externally too.
+ *  Note 3 : cParams : use ZSTD_getCParams() to convert a compression level
+ *           into its associated cParams.
+ *  Limitation 1 : currently not compatible with internal dictionary creation, triggered by
+ *                 ZSTD_CCtx_loadDictionary(), ZSTD_initCStream_usingDict() or ZSTD_initDStream_usingDict().
+ *  Limitation 2 : static cctx currently not compatible with multi-threading.
+ *  Limitation 3 : static dctx is incompatible with legacy support.
  */
-size_t ZSTD_DCtxWorkspaceBound(void);
+ZSTDLIB_API ZSTD_CCtx*    ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize);
+ZSTDLIB_API ZSTD_CStream* ZSTD_initStaticCStream(void* workspace, size_t workspaceSize);    /*< same as ZSTD_initStaticCCtx() */
+
+ZSTDLIB_API ZSTD_DCtx*    ZSTD_initStaticDCtx(void* workspace, size_t workspaceSize);
+ZSTDLIB_API ZSTD_DStream* ZSTD_initStaticDStream(void* workspace, size_t workspaceSize);    /*< same as ZSTD_initStaticDCtx() */
+
+ZSTDLIB_API const ZSTD_CDict* ZSTD_initStaticCDict(
+                                        void* workspace, size_t workspaceSize,
+                                        const void* dict, size_t dictSize,
+                                        ZSTD_dictLoadMethod_e dictLoadMethod,
+                                        ZSTD_dictContentType_e dictContentType,
+                                        ZSTD_compressionParameters cParams);
+
+ZSTDLIB_API const ZSTD_DDict* ZSTD_initStaticDDict(
+                                        void* workspace, size_t workspaceSize,
+                                        const void* dict, size_t dictSize,
+                                        ZSTD_dictLoadMethod_e dictLoadMethod,
+                                        ZSTD_dictContentType_e dictContentType);
+
+
+/*! Custom memory allocation :
+ *  These prototypes make it possible to pass your own allocation/free functions.
+ *  ZSTD_customMem is provided at creation time, using ZSTD_create*_advanced() variants listed below.
+ *  All allocation/free operations will be completed using these custom variants instead of regular <stdlib.h> ones.
+ */
+typedef void* (*ZSTD_allocFunction) (void* opaque, size_t size);
+typedef void  (*ZSTD_freeFunction) (void* opaque, void* address);
+typedef struct { ZSTD_allocFunction customAlloc; ZSTD_freeFunction customFree; void* opaque; } ZSTD_customMem;
+static
+__attribute__((__unused__))
+ZSTD_customMem const ZSTD_defaultCMem = { NULL, NULL, NULL };  /*< this constant defers to stdlib's functions */
+
+ZSTDLIB_API ZSTD_CCtx*    ZSTD_createCCtx_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API ZSTD_DCtx*    ZSTD_createDCtx_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem);
+
+ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictSize,
+                                                  ZSTD_dictLoadMethod_e dictLoadMethod,
+                                                  ZSTD_dictContentType_e dictContentType,
+                                                  ZSTD_compressionParameters cParams,
+                                                  ZSTD_customMem customMem);
+
+/* ! Thread pool :
+ * These prototypes make it possible to share a thread pool among multiple compression contexts.
+ * This can limit resources for applications with multiple threads where each one uses
+ * a threaded compression mode (via ZSTD_c_nbWorkers parameter).
+ * ZSTD_createThreadPool creates a new thread pool with a given number of threads.
+ * Note that the lifetime of such pool must exist while being used.
+ * ZSTD_CCtx_refThreadPool assigns a thread pool to a context (use NULL argument value
+ * to use an internal thread pool).
+ * ZSTD_freeThreadPool frees a thread pool, accepts NULL pointer.
+ */
+typedef struct POOL_ctx_s ZSTD_threadPool;
+ZSTDLIB_API ZSTD_threadPool* ZSTD_createThreadPool(size_t numThreads);
+ZSTDLIB_API void ZSTD_freeThreadPool (ZSTD_threadPool* pool);  /* accept NULL pointer */
+ZSTDLIB_API size_t ZSTD_CCtx_refThreadPool(ZSTD_CCtx* cctx, ZSTD_threadPool* pool);
 
-/**
- * struct ZSTD_DCtx - the zstd decompression context
- *
- * When decompressing many times it is recommended to allocate a context just
- * once and reuse it for each successive decompression operation.
+
+/*
+ * This API is temporary and is expected to change or disappear in the future!
  */
-typedef struct ZSTD_DCtx_s ZSTD_DCtx;
-/**
- * ZSTD_initDCtx() - initialize a zstd decompression context
- * @workspace:     The workspace to emplace the context into. It must outlive
- *                 the returned context.
- * @workspaceSize: The size of workspace. Use ZSTD_DCtxWorkspaceBound() to
- *                 determine how large the workspace must be.
- *
- * Return:         A decompression context emplaced into workspace.
- */
-ZSTD_DCtx *ZSTD_initDCtx(void *workspace, size_t workspaceSize);
-
-/**
- * ZSTD_decompressDCtx() - decompress zstd compressed src into dst
- * @ctx:         The decompression context.
- * @dst:         The buffer to decompress src into.
- * @dstCapacity: The size of the destination buffer. Must be at least as large
- *               as the decompressed size. If the caller cannot upper bound the
- *               decompressed size, then it's better to use the streaming API.
- * @src:         The zstd compressed data to decompress. Multiple concatenated
- *               frames and skippable frames are allowed.
- * @srcSize:     The exact size of the data to decompress.
- *
- * Return:       The decompressed size or an error, which can be checked using
- *               ZSTD_isError().
- */
-size_t ZSTD_decompressDCtx(ZSTD_DCtx *ctx, void *dst, size_t dstCapacity,
-	const void *src, size_t srcSize);
-
-/*-************************
- * Simple dictionary API
- **************************/
-
-/**
- * ZSTD_compress_usingDict() - compress src into dst using a dictionary
- * @ctx:         The context. Must have been initialized with a workspace at
- *               least as large as ZSTD_CCtxWorkspaceBound(params.cParams).
- * @dst:         The buffer to compress src into.
- * @dstCapacity: The size of the destination buffer. May be any size, but
- *               ZSTD_compressBound(srcSize) is guaranteed to be large enough.
- * @src:         The data to compress.
- * @srcSize:     The size of the data to compress.
- * @dict:        The dictionary to use for compression.
- * @dictSize:    The size of the dictionary.
- * @params:      The parameters to use for compression. See ZSTD_getParams().
- *
- * Compression using a predefined dictionary. The same dictionary must be used
- * during decompression.
- *
- * Return:       The compressed size or an error, which can be checked using
- *               ZSTD_isError().
- */
-size_t ZSTD_compress_usingDict(ZSTD_CCtx *ctx, void *dst, size_t dstCapacity,
-	const void *src, size_t srcSize, const void *dict, size_t dictSize,
-	ZSTD_parameters params);
-
-/**
- * ZSTD_decompress_usingDict() - decompress src into dst using a dictionary
- * @ctx:         The decompression context.
- * @dst:         The buffer to decompress src into.
- * @dstCapacity: The size of the destination buffer. Must be at least as large
- *               as the decompressed size. If the caller cannot upper bound the
- *               decompressed size, then it's better to use the streaming API.
- * @src:         The zstd compressed data to decompress. Multiple concatenated
- *               frames and skippable frames are allowed.
- * @srcSize:     The exact size of the data to decompress.
- * @dict:        The dictionary to use for decompression. The same dictionary
- *               must've been used to compress the data.
- * @dictSize:    The size of the dictionary.
- *
- * Return:       The decompressed size or an error, which can be checked using
- *               ZSTD_isError().
- */
-size_t ZSTD_decompress_usingDict(ZSTD_DCtx *ctx, void *dst, size_t dstCapacity,
-	const void *src, size_t srcSize, const void *dict, size_t dictSize);
-
-/*-**************************
- * Fast dictionary API
- ***************************/
-
-/**
- * ZSTD_CDictWorkspaceBound() - memory needed to initialize a ZSTD_CDict
- * @cParams: The compression parameters to be used for compression.
- *
- * Return:   A lower bound on the size of the workspace that is passed to
- *           ZSTD_initCDict().
- */
-size_t ZSTD_CDictWorkspaceBound(ZSTD_compressionParameters cParams);
-
-/**
- * struct ZSTD_CDict - a digested dictionary to be used for compression
+ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced2(
+    const void* dict, size_t dictSize,
+    ZSTD_dictLoadMethod_e dictLoadMethod,
+    ZSTD_dictContentType_e dictContentType,
+    const ZSTD_CCtx_params* cctxParams,
+    ZSTD_customMem customMem);
+
+ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_advanced(
+    const void* dict, size_t dictSize,
+    ZSTD_dictLoadMethod_e dictLoadMethod,
+    ZSTD_dictContentType_e dictContentType,
+    ZSTD_customMem customMem);
+
+
+/* *************************************
+*  Advanced compression functions
+***************************************/
+
+/*! ZSTD_createCDict_byReference() :
+ *  Create a digested dictionary for compression
+ *  Dictionary content is just referenced, not duplicated.
+ *  As a consequence, `dictBuffer` **must** outlive CDict,
+ *  and its content must remain unmodified throughout the lifetime of CDict.
+ *  note: equivalent to ZSTD_createCDict_advanced(), with dictLoadMethod==ZSTD_dlm_byRef */
+ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_byReference(const void* dictBuffer, size_t dictSize, int compressionLevel);
+
+/*! ZSTD_getDictID_fromCDict() :
+ *  Provides the dictID of the dictionary loaded into `cdict`.
+ *  If @return == 0, the dictionary is not conformant to Zstandard specification, or empty.
+ *  Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromCDict(const ZSTD_CDict* cdict);
+
+/*! ZSTD_getCParams() :
+ * @return ZSTD_compressionParameters structure for a selected compression level and estimated srcSize.
+ * `estimatedSrcSize` value is optional, select 0 if not known */
+ZSTDLIB_API ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize);
+
+/*! ZSTD_getParams() :
+ *  same as ZSTD_getCParams(), but @return a full `ZSTD_parameters` object instead of sub-component `ZSTD_compressionParameters`.
+ *  All fields of `ZSTD_frameParameters` are set to default : contentSize=1, checksum=0, noDictID=0 */
+ZSTDLIB_API ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize);
+
+/*! ZSTD_checkCParams() :
+ *  Ensure param values remain within authorized range.
+ * @return 0 on success, or an error code (can be checked with ZSTD_isError()) */
+ZSTDLIB_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params);
+
+/*! ZSTD_adjustCParams() :
+ *  optimize params for a given `srcSize` and `dictSize`.
+ * `srcSize` can be unknown, in which case use ZSTD_CONTENTSIZE_UNKNOWN.
+ * `dictSize` must be `0` when there is no dictionary.
+ *  cPar can be invalid : all parameters will be clamped within valid range in the @return struct.
+ *  This function never fails (wide contract) */
+ZSTDLIB_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize);
+
+/*! ZSTD_compress_advanced() :
+ *  Note : this function is now DEPRECATED.
+ *         It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters.
+ *  This prototype will be marked as deprecated and generate compilation warning on reaching v1.5.x */
+ZSTDLIB_API size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx,
+                                          void* dst, size_t dstCapacity,
+                                    const void* src, size_t srcSize,
+                                    const void* dict,size_t dictSize,
+                                          ZSTD_parameters params);
+
+/*! ZSTD_compress_usingCDict_advanced() :
+ *  Note : this function is now REDUNDANT.
+ *         It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters.
+ *  This prototype will be marked as deprecated and generate compilation warning in some future version */
+ZSTDLIB_API size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx,
+                                              void* dst, size_t dstCapacity,
+                                        const void* src, size_t srcSize,
+                                        const ZSTD_CDict* cdict,
+                                              ZSTD_frameParameters fParams);
+
+
+/*! ZSTD_CCtx_loadDictionary_byReference() :
+ *  Same as ZSTD_CCtx_loadDictionary(), but dictionary content is referenced, instead of being copied into CCtx.
+ *  It saves some memory, but also requires that `dict` outlives its usage within `cctx` */
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_byReference(ZSTD_CCtx* cctx, const void* dict, size_t dictSize);
+
+/*! ZSTD_CCtx_loadDictionary_advanced() :
+ *  Same as ZSTD_CCtx_loadDictionary(), but gives finer control over
+ *  how to load the dictionary (by copy ? by reference ?)
+ *  and how to interpret it (automatic ? force raw mode ? full mode only ?) */
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType);
+
+/*! ZSTD_CCtx_refPrefix_advanced() :
+ *  Same as ZSTD_CCtx_refPrefix(), but gives finer control over
+ *  how to interpret prefix content (automatic ? force raw mode (default) ? full mode only ?) */
+ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType);
+
+/* ===   experimental parameters   === */
+/* these parameters can be used with ZSTD_setParameter()
+ * they are not guaranteed to remain supported in the future */
+
+ /* Enables rsyncable mode,
+  * which makes compressed files more rsync friendly
+  * by adding periodic synchronization points to the compressed data.
+  * The target average block size is ZSTD_c_jobSize / 2.
+  * It's possible to modify the job size to increase or decrease
+  * the granularity of the synchronization point.
+  * Once the jobSize is smaller than the window size,
+  * it will result in compression ratio degradation.
+  * NOTE 1: rsyncable mode only works when multithreading is enabled.
+  * NOTE 2: rsyncable performs poorly in combination with long range mode,
+  * since it will decrease the effectiveness of synchronization points,
+  * though mileage may vary.
+  * NOTE 3: Rsyncable mode limits maximum compression speed to ~400 MB/s.
+  * If the selected compression level is already running significantly slower,
+  * the overall speed won't be significantly impacted.
+  */
+ #define ZSTD_c_rsyncable ZSTD_c_experimentalParam1
+
+/* Select a compression format.
+ * The value must be of type ZSTD_format_e.
+ * See ZSTD_format_e enum definition for details */
+#define ZSTD_c_format ZSTD_c_experimentalParam2
+
+/* Force back-reference distances to remain < windowSize,
+ * even when referencing into Dictionary content (default:0) */
+#define ZSTD_c_forceMaxWindow ZSTD_c_experimentalParam3
+
+/* Controls whether the contents of a CDict
+ * are used in place, or copied into the working context.
+ * Accepts values from the ZSTD_dictAttachPref_e enum.
+ * See the comments on that enum for an explanation of the feature. */
+#define ZSTD_c_forceAttachDict ZSTD_c_experimentalParam4
+
+/* Controls how the literals are compressed (default is auto).
+ * The value must be of type ZSTD_literalCompressionMode_e.
+ * See ZSTD_literalCompressionMode_t enum definition for details.
  */
-typedef struct ZSTD_CDict_s ZSTD_CDict;
+#define ZSTD_c_literalCompressionMode ZSTD_c_experimentalParam5
+
+/* Tries to fit compressed block size to be around targetCBlockSize.
+ * No target when targetCBlockSize == 0.
+ * There is no guarantee on compressed block size (default:0) */
+#define ZSTD_c_targetCBlockSize ZSTD_c_experimentalParam6
+
+/* User's best guess of source size.
+ * Hint is not valid when srcSizeHint == 0.
+ * There is no guarantee that hint is close to actual source size,
+ * but compression ratio may regress significantly if guess considerably underestimates */
+#define ZSTD_c_srcSizeHint ZSTD_c_experimentalParam7
+
+/* Controls whether the new and experimental "dedicated dictionary search
+ * structure" can be used. This feature is still rough around the edges, be
+ * prepared for surprising behavior!
+ *
+ * How to use it:
+ *
+ * When using a CDict, whether to use this feature or not is controlled at
+ * CDict creation, and it must be set in a CCtxParams set passed into that
+ * construction (via ZSTD_createCDict_advanced2()). A compression will then
+ * use the feature or not based on how the CDict was constructed; the value of
+ * this param, set in the CCtx, will have no effect.
+ *
+ * However, when a dictionary buffer is passed into a CCtx, such as via
+ * ZSTD_CCtx_loadDictionary(), this param can be set on the CCtx to control
+ * whether the CDict that is created internally can use the feature or not.
+ *
+ * What it does:
+ *
+ * Normally, the internal data structures of the CDict are analogous to what
+ * would be stored in a CCtx after compressing the contents of a dictionary.
+ * To an approximation, a compression using a dictionary can then use those
+ * data structures to simply continue what is effectively a streaming
+ * compression where the simulated compression of the dictionary left off.
+ * Which is to say, the search structures in the CDict are normally the same
+ * format as in the CCtx.
+ *
+ * It is possible to do better, since the CDict is not like a CCtx: the search
+ * structures are written once during CDict creation, and then are only read
+ * after that, while the search structures in the CCtx are both read and
+ * written as the compression goes along. This means we can choose a search
+ * structure for the dictionary that is read-optimized.
+ *
+ * This feature enables the use of that different structure.
+ *
+ * Note that some of the members of the ZSTD_compressionParameters struct have
+ * different semantics and constraints in the dedicated search structure. It is
+ * highly recommended that you simply set a compression level in the CCtxParams
+ * you pass into the CDict creation call, and avoid messing with the cParams
+ * directly.
+ *
+ * Effects:
+ *
+ * This will only have any effect when the selected ZSTD_strategy
+ * implementation supports this feature. Currently, that's limited to
+ * ZSTD_greedy, ZSTD_lazy, and ZSTD_lazy2.
+ *
+ * Note that this means that the CDict tables can no longer be copied into the
+ * CCtx, so the dict attachment mode ZSTD_dictForceCopy will no longer be
+ * useable. The dictionary can only be attached or reloaded.
+ *
+ * In general, you should expect compression to be faster--sometimes very much
+ * so--and CDict creation to be slightly slower. Eventually, we will probably
+ * make this mode the default.
+ */
+#define ZSTD_c_enableDedicatedDictSearch ZSTD_c_experimentalParam8
+
+/* ZSTD_c_stableInBuffer
+ * Experimental parameter.
+ * Default is 0 == disabled. Set to 1 to enable.
+ *
+ * Tells the compressor that the ZSTD_inBuffer will ALWAYS be the same
+ * between calls, except for the modifications that zstd makes to pos (the
+ * caller must not modify pos). This is checked by the compressor, and
+ * compression will fail if it ever changes. This means the only flush
+ * mode that makes sense is ZSTD_e_end, so zstd will error if ZSTD_e_end
+ * is not used. The data in the ZSTD_inBuffer in the range [src, src + pos)
+ * MUST not be modified during compression or you will get data corruption.
+ *
+ * When this flag is enabled zstd won't allocate an input window buffer,
+ * because the user guarantees it can reference the ZSTD_inBuffer until
+ * the frame is complete. But, it will still allocate an output buffer
+ * large enough to fit a block (see ZSTD_c_stableOutBuffer). This will also
+ * avoid the memcpy() from the input buffer to the input window buffer.
+ *
+ * NOTE: ZSTD_compressStream2() will error if ZSTD_e_end is not used.
+ * That means this flag cannot be used with ZSTD_compressStream().
+ *
+ * NOTE: So long as the ZSTD_inBuffer always points to valid memory, using
+ * this flag is ALWAYS memory safe, and will never access out-of-bounds
+ * memory. However, compression WILL fail if you violate the preconditions.
+ *
+ * WARNING: The data in the ZSTD_inBuffer in the range [dst, dst + pos) MUST
+ * not be modified during compression or you will get data corruption. This
+ * is because zstd needs to reference data in the ZSTD_inBuffer to find
+ * matches. Normally zstd maintains its own window buffer for this purpose,
+ * but passing this flag tells zstd to use the user provided buffer.
+ */
+#define ZSTD_c_stableInBuffer ZSTD_c_experimentalParam9
 
-/**
- * ZSTD_initCDict() - initialize a digested dictionary for compression
- * @dictBuffer:    The dictionary to digest. The buffer is referenced by the
- *                 ZSTD_CDict so it must outlive the returned ZSTD_CDict.
- * @dictSize:      The size of the dictionary.
- * @params:        The parameters to use for compression. See ZSTD_getParams().
- * @workspace:     The workspace. It must outlive the returned ZSTD_CDict.
- * @workspaceSize: The workspace size. Must be at least
- *                 ZSTD_CDictWorkspaceBound(params.cParams).
+/* ZSTD_c_stableOutBuffer
+ * Experimental parameter.
+ * Default is 0 == disabled. Set to 1 to enable.
  *
- * When compressing multiple messages / blocks with the same dictionary it is
- * recommended to load it just once. The ZSTD_CDict merely references the
- * dictBuffer, so it must outlive the returned ZSTD_CDict.
+ * Tells he compressor that the ZSTD_outBuffer will not be resized between
+ * calls. Specifically: (out.size - out.pos) will never grow. This gives the
+ * compressor the freedom to say: If the compressed data doesn't fit in the
+ * output buffer then return ZSTD_error_dstSizeTooSmall. This allows us to
+ * always decompress directly into the output buffer, instead of decompressing
+ * into an internal buffer and copying to the output buffer.
  *
- * Return:         The digested dictionary emplaced into workspace.
+ * When this flag is enabled zstd won't allocate an output buffer, because
+ * it can write directly to the ZSTD_outBuffer. It will still allocate the
+ * input window buffer (see ZSTD_c_stableInBuffer).
+ *
+ * Zstd will check that (out.size - out.pos) never grows and return an error
+ * if it does. While not strictly necessary, this should prevent surprises.
  */
-ZSTD_CDict *ZSTD_initCDict(const void *dictBuffer, size_t dictSize,
-	ZSTD_parameters params, void *workspace, size_t workspaceSize);
+#define ZSTD_c_stableOutBuffer ZSTD_c_experimentalParam10
 
-/**
- * ZSTD_compress_usingCDict() - compress src into dst using a ZSTD_CDict
- * @ctx:         The context. Must have been initialized with a workspace at
- *               least as large as ZSTD_CCtxWorkspaceBound(cParams) where
- *               cParams are the compression parameters used to initialize the
- *               cdict.
- * @dst:         The buffer to compress src into.
- * @dstCapacity: The size of the destination buffer. May be any size, but
- *               ZSTD_compressBound(srcSize) is guaranteed to be large enough.
- * @src:         The data to compress.
- * @srcSize:     The size of the data to compress.
- * @cdict:       The digested dictionary to use for compression.
- * @params:      The parameters to use for compression. See ZSTD_getParams().
+/* ZSTD_c_blockDelimiters
+ * Default is 0 == ZSTD_sf_noBlockDelimiters.
  *
- * Compression using a digested dictionary. The same dictionary must be used
- * during decompression.
+ * For use with sequence compression API: ZSTD_compressSequences().
  *
- * Return:       The compressed size or an error, which can be checked using
- *               ZSTD_isError().
+ * Designates whether or not the given array of ZSTD_Sequence contains block delimiters
+ * and last literals, which are defined as sequences with offset == 0 and matchLength == 0.
+ * See the definition of ZSTD_Sequence for more specifics.
  */
-size_t ZSTD_compress_usingCDict(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity,
-	const void *src, size_t srcSize, const ZSTD_CDict *cdict);
-
+#define ZSTD_c_blockDelimiters ZSTD_c_experimentalParam11
 
-/**
- * ZSTD_DDictWorkspaceBound() - memory needed to initialize a ZSTD_DDict
+/* ZSTD_c_validateSequences
+ * Default is 0 == disabled. Set to 1 to enable sequence validation.
+ *
+ * For use with sequence compression API: ZSTD_compressSequences().
+ * Designates whether or not we validate sequences provided to ZSTD_compressSequences()
+ * during function execution.
+ *
+ * Without validation, providing a sequence that does not conform to the zstd spec will cause
+ * undefined behavior, and may produce a corrupted block.
+ *
+ * With validation enabled, a if sequence is invalid (see doc/zstd_compression_format.md for
+ * specifics regarding offset/matchlength requirements) then the function will bail out and
+ * return an error.
  *
- * Return:  A lower bound on the size of the workspace that is passed to
- *          ZSTD_initDDict().
  */
-size_t ZSTD_DDictWorkspaceBound(void);
+#define ZSTD_c_validateSequences ZSTD_c_experimentalParam12
 
-/**
- * struct ZSTD_DDict - a digested dictionary to be used for decompression
+/*! ZSTD_CCtx_getParameter() :
+ *  Get the requested compression parameter value, selected by enum ZSTD_cParameter,
+ *  and store it into int* value.
+ * @return : 0, or an error code (which can be tested with ZSTD_isError()).
  */
-typedef struct ZSTD_DDict_s ZSTD_DDict;
-
-/**
- * ZSTD_initDDict() - initialize a digested dictionary for decompression
- * @dictBuffer:    The dictionary to digest. The buffer is referenced by the
- *                 ZSTD_DDict so it must outlive the returned ZSTD_DDict.
- * @dictSize:      The size of the dictionary.
- * @workspace:     The workspace. It must outlive the returned ZSTD_DDict.
- * @workspaceSize: The workspace size. Must be at least
- *                 ZSTD_DDictWorkspaceBound().
- *
- * When decompressing multiple messages / blocks with the same dictionary it is
- * recommended to load it just once. The ZSTD_DDict merely references the
- * dictBuffer, so it must outlive the returned ZSTD_DDict.
- *
- * Return:         The digested dictionary emplaced into workspace.
- */
-ZSTD_DDict *ZSTD_initDDict(const void *dictBuffer, size_t dictSize,
-	void *workspace, size_t workspaceSize);
-
-/**
- * ZSTD_decompress_usingDDict() - decompress src into dst using a ZSTD_DDict
- * @ctx:         The decompression context.
- * @dst:         The buffer to decompress src into.
- * @dstCapacity: The size of the destination buffer. Must be at least as large
- *               as the decompressed size. If the caller cannot upper bound the
- *               decompressed size, then it's better to use the streaming API.
- * @src:         The zstd compressed data to decompress. Multiple concatenated
- *               frames and skippable frames are allowed.
- * @srcSize:     The exact size of the data to decompress.
- * @ddict:       The digested dictionary to use for decompression. The same
- *               dictionary must've been used to compress the data.
- *
- * Return:       The decompressed size or an error, which can be checked using
- *               ZSTD_isError().
- */
-size_t ZSTD_decompress_usingDDict(ZSTD_DCtx *dctx, void *dst,
-	size_t dstCapacity, const void *src, size_t srcSize,
-	const ZSTD_DDict *ddict);
-
-
-/*-**************************
- * Streaming
- ***************************/
-
-/**
- * struct ZSTD_inBuffer - input buffer for streaming
- * @src:  Start of the input buffer.
- * @size: Size of the input buffer.
- * @pos:  Position where reading stopped. Will be updated.
- *        Necessarily 0 <= pos <= size.
+ZSTDLIB_API size_t ZSTD_CCtx_getParameter(const ZSTD_CCtx* cctx, ZSTD_cParameter param, int* value);
+
+
+/*! ZSTD_CCtx_params :
+ *  Quick howto :
+ *  - ZSTD_createCCtxParams() : Create a ZSTD_CCtx_params structure
+ *  - ZSTD_CCtxParams_setParameter() : Push parameters one by one into
+ *                                     an existing ZSTD_CCtx_params structure.
+ *                                     This is similar to
+ *                                     ZSTD_CCtx_setParameter().
+ *  - ZSTD_CCtx_setParametersUsingCCtxParams() : Apply parameters to
+ *                                    an existing CCtx.
+ *                                    These parameters will be applied to
+ *                                    all subsequent frames.
+ *  - ZSTD_compressStream2() : Do compression using the CCtx.
+ *  - ZSTD_freeCCtxParams() : Free the memory, accept NULL pointer.
+ *
+ *  This can be used with ZSTD_estimateCCtxSize_advanced_usingCCtxParams()
+ *  for static allocation of CCtx for single-threaded compression.
  */
-typedef struct ZSTD_inBuffer_s {
-	const void *src;
-	size_t size;
-	size_t pos;
-} ZSTD_inBuffer;
+ZSTDLIB_API ZSTD_CCtx_params* ZSTD_createCCtxParams(void);
+ZSTDLIB_API size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params);  /* accept NULL pointer */
 
-/**
- * struct ZSTD_outBuffer - output buffer for streaming
- * @dst:  Start of the output buffer.
- * @size: Size of the output buffer.
- * @pos:  Position where writing stopped. Will be updated.
- *        Necessarily 0 <= pos <= size.
+/*! ZSTD_CCtxParams_reset() :
+ *  Reset params to default values.
  */
-typedef struct ZSTD_outBuffer_s {
-	void *dst;
-	size_t size;
-	size_t pos;
-} ZSTD_outBuffer;
-
-
+ZSTDLIB_API size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params);
 
-/*-*****************************************************************************
- * Streaming compression - HowTo
- *
- * A ZSTD_CStream object is required to track streaming operation.
- * Use ZSTD_initCStream() to initialize a ZSTD_CStream object.
- * ZSTD_CStream objects can be reused multiple times on consecutive compression
- * operations. It is recommended to re-use ZSTD_CStream in situations where many
- * streaming operations will be achieved consecutively. Use one separate
- * ZSTD_CStream per thread for parallel execution.
- *
- * Use ZSTD_compressStream() repetitively to consume input stream.
- * The function will automatically update both `pos` fields.
- * Note that it may not consume the entire input, in which case `pos < size`,
- * and it's up to the caller to present again remaining data.
- * It returns a hint for the preferred number of bytes to use as an input for
- * the next function call.
- *
- * At any moment, it's possible to flush whatever data remains within internal
- * buffer, using ZSTD_flushStream(). `output->pos` will be updated. There might
- * still be some content left within the internal buffer if `output->size` is
- * too small. It returns the number of bytes left in the internal buffer and
- * must be called until it returns 0.
- *
- * ZSTD_endStream() instructs to finish a frame. It will perform a flush and
- * write frame epilogue. The epilogue is required for decoders to consider a
- * frame completed. Similar to ZSTD_flushStream(), it may not be able to flush
- * the full content if `output->size` is too small. In which case, call again
- * ZSTD_endStream() to complete the flush. It returns the number of bytes left
- * in the internal buffer and must be called until it returns 0.
- ******************************************************************************/
+/*! ZSTD_CCtxParams_init() :
+ *  Initializes the compression parameters of cctxParams according to
+ *  compression level. All other parameters are reset to their default values.
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel);
 
-/**
- * ZSTD_CStreamWorkspaceBound() - memory needed to initialize a ZSTD_CStream
- * @cParams: The compression parameters to be used for compression.
- *
- * Return:   A lower bound on the size of the workspace that is passed to
- *           ZSTD_initCStream() and ZSTD_initCStream_usingCDict().
+/*! ZSTD_CCtxParams_init_advanced() :
+ *  Initializes the compression and frame parameters of cctxParams according to
+ *  params. All other parameters are reset to their default values.
  */
-size_t ZSTD_CStreamWorkspaceBound(ZSTD_compressionParameters cParams);
+ZSTDLIB_API size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params);
+
+/*! ZSTD_CCtxParams_setParameter() :
+ *  Similar to ZSTD_CCtx_setParameter.
+ *  Set one compression parameter, selected by enum ZSTD_cParameter.
+ *  Parameters must be applied to a ZSTD_CCtx using
+ *  ZSTD_CCtx_setParametersUsingCCtxParams().
+ * @result : a code representing success or failure (which can be tested with
+ *           ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, int value);
 
-/**
- * struct ZSTD_CStream - the zstd streaming compression context
+/*! ZSTD_CCtxParams_getParameter() :
+ * Similar to ZSTD_CCtx_getParameter.
+ * Get the requested value of one compression parameter, selected by enum ZSTD_cParameter.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_getParameter(const ZSTD_CCtx_params* params, ZSTD_cParameter param, int* value);
+
+/*! ZSTD_CCtx_setParametersUsingCCtxParams() :
+ *  Apply a set of ZSTD_CCtx_params to the compression context.
+ *  This can be done even after compression is started,
+ *    if nbWorkers==0, this will have no impact until a new compression is started.
+ *    if nbWorkers>=1, new parameters will be picked up at next job,
+ *       with a few restrictions (windowLog, pledgedSrcSize, nbWorkers, jobSize, and overlapLog are not updated).
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_setParametersUsingCCtxParams(
+        ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params);
+
+/*! ZSTD_compressStream2_simpleArgs() :
+ *  Same as ZSTD_compressStream2(),
+ *  but using only integral types as arguments.
+ *  This variant might be helpful for binders from dynamic languages
+ *  which have troubles handling structures containing memory pointers.
  */
-typedef struct ZSTD_CStream_s ZSTD_CStream;
+ZSTDLIB_API size_t ZSTD_compressStream2_simpleArgs (
+                            ZSTD_CCtx* cctx,
+                            void* dst, size_t dstCapacity, size_t* dstPos,
+                      const void* src, size_t srcSize, size_t* srcPos,
+                            ZSTD_EndDirective endOp);
+
+
+/* *************************************
+*  Advanced decompression functions
+***************************************/
+
+/*! ZSTD_isFrame() :
+ *  Tells if the content of `buffer` starts with a valid Frame Identifier.
+ *  Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0.
+ *  Note 2 : Legacy Frame Identifiers are considered valid only if Legacy Support is enabled.
+ *  Note 3 : Skippable Frame Identifiers are considered valid. */
+ZSTDLIB_API unsigned ZSTD_isFrame(const void* buffer, size_t size);
+
+/*! ZSTD_createDDict_byReference() :
+ *  Create a digested dictionary, ready to start decompression operation without startup delay.
+ *  Dictionary content is referenced, and therefore stays in dictBuffer.
+ *  It is important that dictBuffer outlives DDict,
+ *  it must remain read accessible throughout the lifetime of DDict */
+ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize);
+
+/*! ZSTD_DCtx_loadDictionary_byReference() :
+ *  Same as ZSTD_DCtx_loadDictionary(),
+ *  but references `dict` content instead of copying it into `dctx`.
+ *  This saves memory if `dict` remains around.,
+ *  However, it's imperative that `dict` remains accessible (and unmodified) while being used, so it must outlive decompression. */
+ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+
+/*! ZSTD_DCtx_loadDictionary_advanced() :
+ *  Same as ZSTD_DCtx_loadDictionary(),
+ *  but gives direct control over
+ *  how to load the dictionary (by copy ? by reference ?)
+ *  and how to interpret it (automatic ? force raw mode ? full mode only ?). */
+ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType);
+
+/*! ZSTD_DCtx_refPrefix_advanced() :
+ *  Same as ZSTD_DCtx_refPrefix(), but gives finer control over
+ *  how to interpret prefix content (automatic ? force raw mode (default) ? full mode only ?) */
+ZSTDLIB_API size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType);
+
+/*! ZSTD_DCtx_setMaxWindowSize() :
+ *  Refuses allocating internal buffers for frames requiring a window size larger than provided limit.
+ *  This protects a decoder context from reserving too much memory for itself (potential attack scenario).
+ *  This parameter is only useful in streaming mode, since no internal buffer is allocated in single-pass mode.
+ *  By default, a decompression context accepts all window sizes <= (1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT)
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize);
 
-/*===== ZSTD_CStream management functions =====*/
-/**
- * ZSTD_initCStream() - initialize a zstd streaming compression context
- * @params:         The zstd compression parameters.
- * @pledgedSrcSize: If params.fParams.contentSizeFlag == 1 then the caller must
- *                  pass the source size (zero means empty source). Otherwise,
- *                  the caller may optionally pass the source size, or zero if
- *                  unknown.
- * @workspace:      The workspace to emplace the context into. It must outlive
- *                  the returned context.
- * @workspaceSize:  The size of workspace.
- *                  Use ZSTD_CStreamWorkspaceBound(params.cParams) to determine
- *                  how large the workspace must be.
- *
- * Return:          The zstd streaming compression context.
- */
-ZSTD_CStream *ZSTD_initCStream(ZSTD_parameters params,
-	unsigned long long pledgedSrcSize, void *workspace,
-	size_t workspaceSize);
-
-/**
- * ZSTD_initCStream_usingCDict() - initialize a streaming compression context
- * @cdict:          The digested dictionary to use for compression.
- * @pledgedSrcSize: Optionally the source size, or zero if unknown.
- * @workspace:      The workspace to emplace the context into. It must outlive
- *                  the returned context.
- * @workspaceSize:  The size of workspace. Call ZSTD_CStreamWorkspaceBound()
- *                  with the cParams used to initialize the cdict to determine
- *                  how large the workspace must be.
- *
- * Return:          The zstd streaming compression context.
- */
-ZSTD_CStream *ZSTD_initCStream_usingCDict(const ZSTD_CDict *cdict,
-	unsigned long long pledgedSrcSize, void *workspace,
-	size_t workspaceSize);
+/*! ZSTD_DCtx_getParameter() :
+ *  Get the requested decompression parameter value, selected by enum ZSTD_dParameter,
+ *  and store it into int* value.
+ * @return : 0, or an error code (which can be tested with ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value);
 
-/*===== Streaming compression functions =====*/
-/**
- * ZSTD_resetCStream() - reset the context using parameters from creation
- * @zcs:            The zstd streaming compression context to reset.
- * @pledgedSrcSize: Optionally the source size, or zero if unknown.
- *
- * Resets the context using the parameters from creation. Skips dictionary
- * loading, since it can be reused. If `pledgedSrcSize` is non-zero the frame
- * content size is always written into the frame header.
- *
- * Return:          Zero or an error, which can be checked using ZSTD_isError().
+/* ZSTD_d_format
+ * experimental parameter,
+ * allowing selection between ZSTD_format_e input compression formats
  */
-size_t ZSTD_resetCStream(ZSTD_CStream *zcs, unsigned long long pledgedSrcSize);
-/**
- * ZSTD_compressStream() - streaming compress some of input into output
- * @zcs:    The zstd streaming compression context.
- * @output: Destination buffer. `output->pos` is updated to indicate how much
- *          compressed data was written.
- * @input:  Source buffer. `input->pos` is updated to indicate how much data was
- *          read. Note that it may not consume the entire input, in which case
- *          `input->pos < input->size`, and it's up to the caller to present
- *          remaining data again.
- *
- * The `input` and `output` buffers may be any size. Guaranteed to make some
- * forward progress if `input` and `output` are not empty.
- *
- * Return:  A hint for the number of bytes to use as the input for the next
- *          function call or an error, which can be checked using
- *          ZSTD_isError().
+#define ZSTD_d_format ZSTD_d_experimentalParam1
+/* ZSTD_d_stableOutBuffer
+ * Experimental parameter.
+ * Default is 0 == disabled. Set to 1 to enable.
+ *
+ * Tells the decompressor that the ZSTD_outBuffer will ALWAYS be the same
+ * between calls, except for the modifications that zstd makes to pos (the
+ * caller must not modify pos). This is checked by the decompressor, and
+ * decompression will fail if it ever changes. Therefore the ZSTD_outBuffer
+ * MUST be large enough to fit the entire decompressed frame. This will be
+ * checked when the frame content size is known. The data in the ZSTD_outBuffer
+ * in the range [dst, dst + pos) MUST not be modified during decompression
+ * or you will get data corruption.
+ *
+ * When this flags is enabled zstd won't allocate an output buffer, because
+ * it can write directly to the ZSTD_outBuffer, but it will still allocate
+ * an input buffer large enough to fit any compressed block. This will also
+ * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer.
+ * If you need to avoid the input buffer allocation use the buffer-less
+ * streaming API.
+ *
+ * NOTE: So long as the ZSTD_outBuffer always points to valid memory, using
+ * this flag is ALWAYS memory safe, and will never access out-of-bounds
+ * memory. However, decompression WILL fail if you violate the preconditions.
+ *
+ * WARNING: The data in the ZSTD_outBuffer in the range [dst, dst + pos) MUST
+ * not be modified during decompression or you will get data corruption. This
+ * is because zstd needs to reference data in the ZSTD_outBuffer to regenerate
+ * matches. Normally zstd maintains its own buffer for this purpose, but passing
+ * this flag tells zstd to use the user provided buffer.
  */
-size_t ZSTD_compressStream(ZSTD_CStream *zcs, ZSTD_outBuffer *output,
-	ZSTD_inBuffer *input);
-/**
- * ZSTD_flushStream() - flush internal buffers into output
- * @zcs:    The zstd streaming compression context.
- * @output: Destination buffer. `output->pos` is updated to indicate how much
- *          compressed data was written.
- *
- * ZSTD_flushStream() must be called until it returns 0, meaning all the data
- * has been flushed. Since ZSTD_flushStream() causes a block to be ended,
- * calling it too often will degrade the compression ratio.
+#define ZSTD_d_stableOutBuffer ZSTD_d_experimentalParam2
+
+/* ZSTD_d_forceIgnoreChecksum
+ * Experimental parameter.
+ * Default is 0 == disabled. Set to 1 to enable
  *
- * Return:  The number of bytes still present within internal buffers or an
- *          error, which can be checked using ZSTD_isError().
+ * Tells the decompressor to skip checksum validation during decompression, regardless
+ * of whether checksumming was specified during compression. This offers some
+ * slight performance benefits, and may be useful for debugging.
+ * Param has values of type ZSTD_forceIgnoreChecksum_e
  */
-size_t ZSTD_flushStream(ZSTD_CStream *zcs, ZSTD_outBuffer *output);
-/**
- * ZSTD_endStream() - flush internal buffers into output and end the frame
- * @zcs:    The zstd streaming compression context.
- * @output: Destination buffer. `output->pos` is updated to indicate how much
- *          compressed data was written.
+#define ZSTD_d_forceIgnoreChecksum ZSTD_d_experimentalParam3
+
+/* ZSTD_d_refMultipleDDicts
+ * Experimental parameter.
+ * Default is 0 == disabled. Set to 1 to enable
  *
- * ZSTD_endStream() must be called until it returns 0, meaning all the data has
- * been flushed and the frame epilogue has been written.
+ * If enabled and dctx is allocated on the heap, then additional memory will be allocated
+ * to store references to multiple ZSTD_DDict. That is, multiple calls of ZSTD_refDDict()
+ * using a given ZSTD_DCtx, rather than overwriting the previous DDict reference, will instead
+ * store all references. At decompression time, the appropriate dictID is selected
+ * from the set of DDicts based on the dictID in the frame.
  *
- * Return:  The number of bytes still present within internal buffers or an
- *          error, which can be checked using ZSTD_isError().
- */
-size_t ZSTD_endStream(ZSTD_CStream *zcs, ZSTD_outBuffer *output);
-
-/**
- * ZSTD_CStreamInSize() - recommended size for the input buffer
+ * Usage is simply calling ZSTD_refDDict() on multiple dict buffers.
  *
- * Return: The recommended size for the input buffer.
- */
-size_t ZSTD_CStreamInSize(void);
-/**
- * ZSTD_CStreamOutSize() - recommended size for the output buffer
+ * Param has values of byte ZSTD_refMultipleDDicts_e
  *
- * When the output buffer is at least this large, it is guaranteed to be large
- * enough to flush at least one complete compressed block.
+ * WARNING: Enabling this parameter and calling ZSTD_DCtx_refDDict(), will trigger memory
+ * allocation for the hash table. ZSTD_freeDCtx() also frees this memory.
+ * Memory is allocated as per ZSTD_DCtx::customMem.
  *
- * Return: The recommended size for the output buffer.
+ * Although this function allocates memory for the table, the user is still responsible for
+ * memory management of the underlying ZSTD_DDict* themselves.
  */
-size_t ZSTD_CStreamOutSize(void);
-
+#define ZSTD_d_refMultipleDDicts ZSTD_d_experimentalParam4
 
 
-/*-*****************************************************************************
- * Streaming decompression - HowTo
- *
- * A ZSTD_DStream object is required to track streaming operations.
- * Use ZSTD_initDStream() to initialize a ZSTD_DStream object.
- * ZSTD_DStream objects can be re-used multiple times.
- *
- * Use ZSTD_decompressStream() repetitively to consume your input.
- * The function will update both `pos` fields.
- * If `input->pos < input->size`, some input has not been consumed.
- * It's up to the caller to present again remaining data.
- * If `output->pos < output->size`, decoder has flushed everything it could.
- * Returns 0 iff a frame is completely decoded and fully flushed.
- * Otherwise it returns a suggested next input size that will never load more
- * than the current frame.
- ******************************************************************************/
+/*! ZSTD_DCtx_setFormat() :
+ *  Instruct the decoder context about what kind of data to decode next.
+ *  This instruction is mandatory to decode data without a fully-formed header,
+ *  such ZSTD_f_zstd1_magicless for example.
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()). */
+ZSTDLIB_API size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format);
 
-/**
- * ZSTD_DStreamWorkspaceBound() - memory needed to initialize a ZSTD_DStream
- * @maxWindowSize: The maximum window size allowed for compressed frames.
- *
- * Return:         A lower bound on the size of the workspace that is passed to
- *                 ZSTD_initDStream() and ZSTD_initDStream_usingDDict().
+/*! ZSTD_decompressStream_simpleArgs() :
+ *  Same as ZSTD_decompressStream(),
+ *  but using only integral types as arguments.
+ *  This can be helpful for binders from dynamic languages
+ *  which have troubles handling structures containing memory pointers.
  */
-size_t ZSTD_DStreamWorkspaceBound(size_t maxWindowSize);
+ZSTDLIB_API size_t ZSTD_decompressStream_simpleArgs (
+                            ZSTD_DCtx* dctx,
+                            void* dst, size_t dstCapacity, size_t* dstPos,
+                      const void* src, size_t srcSize, size_t* srcPos);
+
+
+/* ******************************************************************
+*  Advanced streaming functions
+*  Warning : most of these functions are now redundant with the Advanced API.
+*  Once Advanced API reaches "stable" status,
+*  redundant functions will be deprecated, and then at some point removed.
+********************************************************************/
+
+/*=====   Advanced Streaming compression functions  =====*/
+
+/*! ZSTD_initCStream_srcSize() :
+ * This function is deprecated, and equivalent to:
+ *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ *     ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any)
+ *     ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel);
+ *     ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
+ *
+ * pledgedSrcSize must be correct. If it is not known at init time, use
+ * ZSTD_CONTENTSIZE_UNKNOWN. Note that, for compatibility with older programs,
+ * "0" also disables frame content size field. It may be enabled in the future.
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t
+ZSTD_initCStream_srcSize(ZSTD_CStream* zcs,
+                         int compressionLevel,
+                         unsigned long long pledgedSrcSize);
+
+/*! ZSTD_initCStream_usingDict() :
+ * This function is deprecated, and is equivalent to:
+ *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ *     ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel);
+ *     ZSTD_CCtx_loadDictionary(zcs, dict, dictSize);
+ *
+ * Creates of an internal CDict (incompatible with static CCtx), except if
+ * dict == NULL or dictSize < 8, in which case no dict is used.
+ * Note: dict is loaded with ZSTD_dct_auto (treated as a full zstd dictionary if
+ * it begins with ZSTD_MAGIC_DICTIONARY, else as raw content) and ZSTD_dlm_byCopy.
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t
+ZSTD_initCStream_usingDict(ZSTD_CStream* zcs,
+                     const void* dict, size_t dictSize,
+                           int compressionLevel);
+
+/*! ZSTD_initCStream_advanced() :
+ * This function is deprecated, and is approximately equivalent to:
+ *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ *     // Pseudocode: Set each zstd parameter and leave the rest as-is.
+ *     for ((param, value) : params) {
+ *         ZSTD_CCtx_setParameter(zcs, param, value);
+ *     }
+ *     ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
+ *     ZSTD_CCtx_loadDictionary(zcs, dict, dictSize);
+ *
+ * dict is loaded with ZSTD_dct_auto and ZSTD_dlm_byCopy.
+ * pledgedSrcSize must be correct.
+ * If srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOWN.
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t
+ZSTD_initCStream_advanced(ZSTD_CStream* zcs,
+                    const void* dict, size_t dictSize,
+                          ZSTD_parameters params,
+                          unsigned long long pledgedSrcSize);
+
+/*! ZSTD_initCStream_usingCDict() :
+ * This function is deprecated, and equivalent to:
+ *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ *     ZSTD_CCtx_refCDict(zcs, cdict);
+ *
+ * note : cdict will just be referenced, and must outlive compression session
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict);
+
+/*! ZSTD_initCStream_usingCDict_advanced() :
+ *   This function is DEPRECATED, and is approximately equivalent to:
+ *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ *     // Pseudocode: Set each zstd frame parameter and leave the rest as-is.
+ *     for ((fParam, value) : fParams) {
+ *         ZSTD_CCtx_setParameter(zcs, fParam, value);
+ *     }
+ *     ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
+ *     ZSTD_CCtx_refCDict(zcs, cdict);
+ *
+ * same as ZSTD_initCStream_usingCDict(), with control over frame parameters.
+ * pledgedSrcSize must be correct. If srcSize is not known at init time, use
+ * value ZSTD_CONTENTSIZE_UNKNOWN.
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t
+ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs,
+                               const ZSTD_CDict* cdict,
+                                     ZSTD_frameParameters fParams,
+                                     unsigned long long pledgedSrcSize);
+
+/*! ZSTD_resetCStream() :
+ * This function is deprecated, and is equivalent to:
+ *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ *     ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
+ *
+ *  start a new frame, using same parameters from previous frame.
+ *  This is typically useful to skip dictionary loading stage, since it will re-use it in-place.
+ *  Note that zcs must be init at least once before using ZSTD_resetCStream().
+ *  If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN.
+ *  If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and controlled at the end.
+ *  For the time being, pledgedSrcSize==0 is interpreted as "srcSize unknown" for compatibility with older programs,
+ *  but it will change to mean "empty" in future version, so use macro ZSTD_CONTENTSIZE_UNKNOWN instead.
+ * @return : 0, or an error code (which can be tested using ZSTD_isError())
+ *  Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize);
 
-/**
- * struct ZSTD_DStream - the zstd streaming decompression context
+
+typedef struct {
+    unsigned long long ingested;   /* nb input bytes read and buffered */
+    unsigned long long consumed;   /* nb input bytes actually compressed */
+    unsigned long long produced;   /* nb of compressed bytes generated and buffered */
+    unsigned long long flushed;    /* nb of compressed bytes flushed : not provided; can be tracked from caller side */
+    unsigned currentJobID;         /* MT only : latest started job nb */
+    unsigned nbActiveWorkers;      /* MT only : nb of workers actively compressing at probe time */
+} ZSTD_frameProgression;
+
+/* ZSTD_getFrameProgression() :
+ * tells how much data has been ingested (read from input)
+ * consumed (input actually compressed) and produced (output) for current frame.
+ * Note : (ingested - consumed) is amount of input data buffered internally, not yet compressed.
+ * Aggregates progression inside active worker threads.
  */
-typedef struct ZSTD_DStream_s ZSTD_DStream;
-/*===== ZSTD_DStream management functions =====*/
-/**
- * ZSTD_initDStream() - initialize a zstd streaming decompression context
- * @maxWindowSize: The maximum window size allowed for compressed frames.
- * @workspace:     The workspace to emplace the context into. It must outlive
- *                 the returned context.
- * @workspaceSize: The size of workspace.
- *                 Use ZSTD_DStreamWorkspaceBound(maxWindowSize) to determine
- *                 how large the workspace must be.
- *
- * Return:         The zstd streaming decompression context.
- */
-ZSTD_DStream *ZSTD_initDStream(size_t maxWindowSize, void *workspace,
-	size_t workspaceSize);
-/**
- * ZSTD_initDStream_usingDDict() - initialize streaming decompression context
- * @maxWindowSize: The maximum window size allowed for compressed frames.
- * @ddict:         The digested dictionary to use for decompression.
- * @workspace:     The workspace to emplace the context into. It must outlive
- *                 the returned context.
- * @workspaceSize: The size of workspace.
- *                 Use ZSTD_DStreamWorkspaceBound(maxWindowSize) to determine
- *                 how large the workspace must be.
- *
- * Return:         The zstd streaming decompression context.
- */
-ZSTD_DStream *ZSTD_initDStream_usingDDict(size_t maxWindowSize,
-	const ZSTD_DDict *ddict, void *workspace, size_t workspaceSize);
+ZSTDLIB_API ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx);
+
+/*! ZSTD_toFlushNow() :
+ *  Tell how many bytes are ready to be flushed immediately.
+ *  Useful for multithreading scenarios (nbWorkers >= 1).
+ *  Probe the oldest active job, defined as oldest job not yet entirely flushed,
+ *  and check its output buffer.
+ * @return : amount of data stored in oldest job and ready to be flushed immediately.
+ *  if @return == 0, it means either :
+ *  + there is no active job (could be checked with ZSTD_frameProgression()), or
+ *  + oldest job is still actively compressing data,
+ *    but everything it has produced has also been flushed so far,
+ *    therefore flush speed is limited by production speed of oldest job
+ *    irrespective of the speed of concurrent (and newer) jobs.
+ */
+ZSTDLIB_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx);
 
-/*===== Streaming decompression functions =====*/
-/**
- * ZSTD_resetDStream() - reset the context using parameters from creation
- * @zds:   The zstd streaming decompression context to reset.
+
+/*=====   Advanced Streaming decompression functions  =====*/
+
+/*!
+ * This function is deprecated, and is equivalent to:
  *
- * Resets the context using the parameters from creation. Skips dictionary
- * loading, since it can be reused.
+ *     ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
+ *     ZSTD_DCtx_loadDictionary(zds, dict, dictSize);
  *
- * Return: Zero or an error, which can be checked using ZSTD_isError().
+ * note: no dictionary will be used if dict == NULL or dictSize < 8
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
  */
-size_t ZSTD_resetDStream(ZSTD_DStream *zds);
-/**
- * ZSTD_decompressStream() - streaming decompress some of input into output
- * @zds:    The zstd streaming decompression context.
- * @output: Destination buffer. `output.pos` is updated to indicate how much
- *          decompressed data was written.
- * @input:  Source buffer. `input.pos` is updated to indicate how much data was
- *          read. Note that it may not consume the entire input, in which case
- *          `input.pos < input.size`, and it's up to the caller to present
- *          remaining data again.
+ZSTDLIB_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize);
+
+/*!
+ * This function is deprecated, and is equivalent to:
  *
- * The `input` and `output` buffers may be any size. Guaranteed to make some
- * forward progress if `input` and `output` are not empty.
- * ZSTD_decompressStream() will not consume the last byte of the frame until
- * the entire frame is flushed.
+ *     ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
+ *     ZSTD_DCtx_refDDict(zds, ddict);
  *
- * Return:  Returns 0 iff a frame is completely decoded and fully flushed.
- *          Otherwise returns a hint for the number of bytes to use as the input
- *          for the next function call or an error, which can be checked using
- *          ZSTD_isError(). The size hint will never load more than the frame.
+ * note : ddict is referenced, it must outlive decompression session
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
  */
-size_t ZSTD_decompressStream(ZSTD_DStream *zds, ZSTD_outBuffer *output,
-	ZSTD_inBuffer *input);
+ZSTDLIB_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict);
 
-/**
- * ZSTD_DStreamInSize() - recommended size for the input buffer
- *
- * Return: The recommended size for the input buffer.
- */
-size_t ZSTD_DStreamInSize(void);
-/**
- * ZSTD_DStreamOutSize() - recommended size for the output buffer
+/*!
+ * This function is deprecated, and is equivalent to:
  *
- * When the output buffer is at least this large, it is guaranteed to be large
- * enough to flush at least one complete decompressed block.
+ *     ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
  *
- * Return: The recommended size for the output buffer.
+ * re-use decompression parameters from previous init; saves dictionary loading
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
  */
-size_t ZSTD_DStreamOutSize(void);
+ZSTDLIB_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);
 
 
-/* --- Constants ---*/
-#define ZSTD_MAGICNUMBER            0xFD2FB528   /* >= v0.8.0 */
-#define ZSTD_MAGIC_SKIPPABLE_START  0x184D2A50U
+/* *******************************************************************
+*  Buffer-less and synchronous inner streaming functions
+*
+*  This is an advanced API, giving full control over buffer management, for users which need direct control over memory.
+*  But it's also a complex one, with several restrictions, documented below.
+*  Prefer normal streaming API for an easier experience.
+********************************************************************* */
 
-#define ZSTD_CONTENTSIZE_UNKNOWN (0ULL - 1)
-#define ZSTD_CONTENTSIZE_ERROR   (0ULL - 2)
+/*
+  Buffer-less streaming compression (synchronous mode)
+
+  A ZSTD_CCtx object is required to track streaming operations.
+  Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resource.
+  ZSTD_CCtx object can be re-used multiple times within successive compression operations.
+
+  Start by initializing a context.
+  Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression,
+  or ZSTD_compressBegin_advanced(), for finer parameter control.
+  It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx()
+
+  Then, consume your input using ZSTD_compressContinue().
+  There are some important considerations to keep in mind when using this advanced function :
+  - ZSTD_compressContinue() has no internal buffer. It uses externally provided buffers only.
+  - Interface is synchronous : input is consumed entirely and produces 1+ compressed blocks.
+  - Caller must ensure there is enough space in `dst` to store compressed data under worst case scenario.
+    Worst case evaluation is provided by ZSTD_compressBound().
+    ZSTD_compressContinue() doesn't guarantee recover after a failed compression.
+  - ZSTD_compressContinue() presumes prior input ***is still accessible and unmodified*** (up to maximum distance size, see WindowLog).
+    It remembers all previous contiguous blocks, plus one separated memory segment (which can itself consists of multiple contiguous blocks)
+  - ZSTD_compressContinue() detects that prior input has been overwritten when `src` buffer overlaps.
+    In which case, it will "discard" the relevant memory section from its history.
+
+  Finish a frame with ZSTD_compressEnd(), which will write the last block(s) and optional checksum.
+  It's possible to use srcSize==0, in which case, it will write a final empty block to end the frame.
+  Without last block mark, frames are considered unfinished (hence corrupted) by compliant decoders.
+
+  `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress again.
+*/
 
-#define ZSTD_WINDOWLOG_MAX_32  27
-#define ZSTD_WINDOWLOG_MAX_64  27
-#define ZSTD_WINDOWLOG_MAX \
-	((unsigned int)(sizeof(size_t) == 4 \
-		? ZSTD_WINDOWLOG_MAX_32 \
-		: ZSTD_WINDOWLOG_MAX_64))
-#define ZSTD_WINDOWLOG_MIN 10
-#define ZSTD_HASHLOG_MAX ZSTD_WINDOWLOG_MAX
-#define ZSTD_HASHLOG_MIN        6
-#define ZSTD_CHAINLOG_MAX     (ZSTD_WINDOWLOG_MAX+1)
-#define ZSTD_CHAINLOG_MIN      ZSTD_HASHLOG_MIN
-#define ZSTD_HASHLOG3_MAX      17
-#define ZSTD_SEARCHLOG_MAX    (ZSTD_WINDOWLOG_MAX-1)
-#define ZSTD_SEARCHLOG_MIN      1
-/* only for ZSTD_fast, other strategies are limited to 6 */
-#define ZSTD_SEARCHLENGTH_MAX   7
-/* only for ZSTD_btopt, other strategies are limited to 4 */
-#define ZSTD_SEARCHLENGTH_MIN   3
-#define ZSTD_TARGETLENGTH_MIN   4
-#define ZSTD_TARGETLENGTH_MAX 999
-
-/* for static allocation */
-#define ZSTD_FRAMEHEADERSIZE_MAX 18
-#define ZSTD_FRAMEHEADERSIZE_MIN  6
-#define ZSTD_frameHeaderSize_prefix 5
-#define ZSTD_frameHeaderSize_min ZSTD_FRAMEHEADERSIZE_MIN
-#define ZSTD_frameHeaderSize_max ZSTD_FRAMEHEADERSIZE_MAX
-/* magic number + skippable frame length */
-#define ZSTD_skippableHeaderSize 8
-
-
-/*-*************************************
- * Compressed size functions
- **************************************/
-
-/**
- * ZSTD_findFrameCompressedSize() - returns the size of a compressed frame
- * @src:     Source buffer. It should point to the start of a zstd encoded frame
- *           or a skippable frame.
- * @srcSize: The size of the source buffer. It must be at least as large as the
- *           size of the frame.
- *
- * Return:   The compressed size of the frame pointed to by `src` or an error,
- *           which can be check with ZSTD_isError().
- *           Suitable to pass to ZSTD_decompress() or similar functions.
- */
-size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize);
-
-/*-*************************************
- * Decompressed size functions
- **************************************/
-/**
- * ZSTD_getFrameContentSize() - returns the content size in a zstd frame header
- * @src:     It should point to the start of a zstd encoded frame.
- * @srcSize: The size of the source buffer. It must be at least as large as the
- *           frame header. `ZSTD_frameHeaderSize_max` is always large enough.
- *
- * Return:   The frame content size stored in the frame header if known.
- *           `ZSTD_CONTENTSIZE_UNKNOWN` if the content size isn't stored in the
- *           frame header. `ZSTD_CONTENTSIZE_ERROR` on invalid input.
- */
-unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize);
-
-/**
- * ZSTD_findDecompressedSize() - returns decompressed size of a series of frames
- * @src:     It should point to the start of a series of zstd encoded and/or
- *           skippable frames.
- * @srcSize: The exact size of the series of frames.
- *
- * If any zstd encoded frame in the series doesn't have the frame content size
- * set, `ZSTD_CONTENTSIZE_UNKNOWN` is returned. But frame content size is always
- * set when using ZSTD_compress(). The decompressed size can be very large.
- * If the source is untrusted, the decompressed size could be wrong or
- * intentionally modified. Always ensure the result fits within the
- * application's authorized limits. ZSTD_findDecompressedSize() handles multiple
- * frames, and so it must traverse the input to read each frame header. This is
- * efficient as most of the data is skipped, however it does mean that all frame
- * data must be present and valid.
- *
- * Return:   Decompressed size of all the data contained in the frames if known.
- *           `ZSTD_CONTENTSIZE_UNKNOWN` if the decompressed size is unknown.
- *           `ZSTD_CONTENTSIZE_ERROR` if an error occurred.
- */
-unsigned long long ZSTD_findDecompressedSize(const void *src, size_t srcSize);
-
-/*-*************************************
- * Advanced compression functions
- **************************************/
-/**
- * ZSTD_checkCParams() - ensure parameter values remain within authorized range
- * @cParams: The zstd compression parameters.
- *
- * Return:   Zero or an error, which can be checked using ZSTD_isError().
- */
-size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams);
-
-/**
- * ZSTD_adjustCParams() - optimize parameters for a given srcSize and dictSize
- * @srcSize:  Optionally the estimated source size, or zero if unknown.
- * @dictSize: Optionally the estimated dictionary size, or zero if unknown.
- *
- * Return:    The optimized parameters.
- */
-ZSTD_compressionParameters ZSTD_adjustCParams(
-	ZSTD_compressionParameters cParams, unsigned long long srcSize,
-	size_t dictSize);
-
-/*--- Advanced decompression functions ---*/
-
-/**
- * ZSTD_isFrame() - returns true iff the buffer starts with a valid frame
- * @buffer: The source buffer to check.
- * @size:   The size of the source buffer, must be at least 4 bytes.
- *
- * Return: True iff the buffer starts with a zstd or skippable frame identifier.
- */
-unsigned int ZSTD_isFrame(const void *buffer, size_t size);
-
-/**
- * ZSTD_getDictID_fromDict() - returns the dictionary id stored in a dictionary
- * @dict:     The dictionary buffer.
- * @dictSize: The size of the dictionary buffer.
- *
- * Return:    The dictionary id stored within the dictionary or 0 if the
- *            dictionary is not a zstd dictionary. If it returns 0 the
- *            dictionary can still be loaded as a content-only dictionary.
- */
-unsigned int ZSTD_getDictID_fromDict(const void *dict, size_t dictSize);
+/*=====   Buffer-less streaming compression functions  =====*/
+ZSTDLIB_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel);
+ZSTDLIB_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel);
+ZSTDLIB_API size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /*< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */
+ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /*< note: fails if cdict==NULL */
+ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize);   /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */
+ZSTDLIB_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*<  note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */
 
-/**
- * ZSTD_getDictID_fromDDict() - returns the dictionary id stored in a ZSTD_DDict
- * @ddict: The ddict to find the id of.
- *
- * Return: The dictionary id stored within `ddict` or 0 if the dictionary is not
- *         a zstd dictionary. If it returns 0 `ddict` will be loaded as a
- *         content-only dictionary.
- */
-unsigned int ZSTD_getDictID_fromDDict(const ZSTD_DDict *ddict);
+ZSTDLIB_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTDLIB_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
 
-/**
- * ZSTD_getDictID_fromFrame() - returns the dictionary id stored in a zstd frame
- * @src:     Source buffer. It must be a zstd encoded frame.
- * @srcSize: The size of the source buffer. It must be at least as large as the
- *           frame header. `ZSTD_frameHeaderSize_max` is always large enough.
- *
- * Return:   The dictionary id required to decompress the frame stored within
- *           `src` or 0 if the dictionary id could not be decoded. It can return
- *           0 if the frame does not require a dictionary, the dictionary id
- *           wasn't stored in the frame, `src` is not a zstd frame, or `srcSize`
- *           is too small.
- */
-unsigned int ZSTD_getDictID_fromFrame(const void *src, size_t srcSize);
 
-/**
- * struct ZSTD_frameParams - zstd frame parameters stored in the frame header
- * @frameContentSize: The frame content size, or 0 if not present.
- * @windowSize:       The window size, or 0 if the frame is a skippable frame.
- * @dictID:           The dictionary id, or 0 if not present.
- * @checksumFlag:     Whether a checksum was used.
- */
+/*
+  Buffer-less streaming decompression (synchronous mode)
+
+  A ZSTD_DCtx object is required to track streaming operations.
+  Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it.
+  A ZSTD_DCtx object can be re-used multiple times.
+
+  First typical operation is to retrieve frame parameters, using ZSTD_getFrameHeader().
+  Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough.
+  Data fragment must be large enough to ensure successful decoding.
+ `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough.
+  @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled.
+           >0 : `srcSize` is too small, please provide at least @result bytes on next attempt.
+           errorCode, which can be tested using ZSTD_isError().
+
+  It fills a ZSTD_frameHeader structure with important information to correctly decode the frame,
+  such as the dictionary ID, content size, or maximum back-reference distance (`windowSize`).
+  Note that these values could be wrong, either because of data corruption, or because a 3rd party deliberately spoofs false information.
+  As a consequence, check that values remain within valid application range.
+  For example, do not allocate memory blindly, check that `windowSize` is within expectation.
+  Each application can set its own limits, depending on local restrictions.
+  For extended interoperability, it is recommended to support `windowSize` of at least 8 MB.
+
+  ZSTD_decompressContinue() needs previous data blocks during decompression, up to `windowSize` bytes.
+  ZSTD_decompressContinue() is very sensitive to contiguity,
+  if 2 blocks don't follow each other, make sure that either the compressor breaks contiguity at the same place,
+  or that previous contiguous segment is large enough to properly handle maximum back-reference distance.
+  There are multiple ways to guarantee this condition.
+
+  The most memory efficient way is to use a round buffer of sufficient size.
+  Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(),
+  which can @return an error code if required value is too large for current system (in 32-bits mode).
+  In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one,
+  up to the moment there is not enough room left in the buffer to guarantee decoding another full block,
+  which maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`.
+  At which point, decoding can resume from the beginning of the buffer.
+  Note that already decoded data stored in the buffer should be flushed before being overwritten.
+
+  There are alternatives possible, for example using two or more buffers of size `windowSize` each, though they consume more memory.
+
+  Finally, if you control the compression process, you can also ignore all buffer size rules,
+  as long as the encoder and decoder progress in "lock-step",
+  aka use exactly the same buffer sizes, break contiguity at the same place, etc.
+
+  Once buffers are setup, start decompression, with ZSTD_decompressBegin().
+  If decompression requires a dictionary, use ZSTD_decompressBegin_usingDict() or ZSTD_decompressBegin_usingDDict().
+
+  Then use ZSTD_nextSrcSizeToDecompress() and ZSTD_decompressContinue() alternatively.
+  ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue().
+  ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail.
+
+ @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity).
+  It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item.
+  It can also be an error code, which can be tested with ZSTD_isError().
+
+  A frame is fully decoded when ZSTD_nextSrcSizeToDecompress() returns zero.
+  Context can then be reset to start a new decompression.
+
+  Note : it's possible to know if next input to present is a header or a block, using ZSTD_nextInputType().
+  This information is not required to properly decode a frame.
+
+  == Special case : skippable frames ==
+
+  Skippable frames allow integration of user-defined data into a flow of concatenated frames.
+  Skippable frames will be ignored (skipped) by decompressor.
+  The format of skippable frames is as follows :
+  a) Skippable frame ID - 4 Bytes, Little endian format, any value from 0x184D2A50 to 0x184D2A5F
+  b) Frame Size - 4 Bytes, Little endian format, unsigned 32-bits
+  c) Frame Content - any content (User Data) of length equal to Frame Size
+  For skippable frames ZSTD_getFrameHeader() returns zfhPtr->frameType==ZSTD_skippableFrame.
+  For skippable frames ZSTD_decompressContinue() always returns 0 : it only skips the content.
+*/
+
+/*=====   Buffer-less streaming decompression functions  =====*/
+typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e;
 typedef struct {
-	unsigned long long frameContentSize;
-	unsigned int windowSize;
-	unsigned int dictID;
-	unsigned int checksumFlag;
-} ZSTD_frameParams;
-
-/**
- * ZSTD_getFrameParams() - extracts parameters from a zstd or skippable frame
- * @fparamsPtr: On success the frame parameters are written here.
- * @src:        The source buffer. It must point to a zstd or skippable frame.
- * @srcSize:    The size of the source buffer. `ZSTD_frameHeaderSize_max` is
- *              always large enough to succeed.
- *
- * Return:      0 on success. If more data is required it returns how many bytes
- *              must be provided to make forward progress. Otherwise it returns
- *              an error, which can be checked using ZSTD_isError().
- */
-size_t ZSTD_getFrameParams(ZSTD_frameParams *fparamsPtr, const void *src,
-	size_t srcSize);
-
-/*-*****************************************************************************
- * Buffer-less and synchronous inner streaming functions
- *
- * This is an advanced API, giving full control over buffer management, for
- * users which need direct control over memory.
- * But it's also a complex one, with many restrictions (documented below).
- * Prefer using normal streaming API for an easier experience
- ******************************************************************************/
+    unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */
+    unsigned long long windowSize;       /* can be very large, up to <= frameContentSize */
+    unsigned blockSizeMax;
+    ZSTD_frameType_e frameType;          /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */
+    unsigned headerSize;
+    unsigned dictID;
+    unsigned checksumFlag;
+} ZSTD_frameHeader;
+
+/*! ZSTD_getFrameHeader() :
+ *  decode Frame Header, or requires larger `srcSize`.
+ * @return : 0, `zfhPtr` is correctly filled,
+ *          >0, `srcSize` is too small, value is wanted `srcSize` amount,
+ *           or an error code, which can be tested using ZSTD_isError() */
+ZSTDLIB_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize);   /*< doesn't consume input */
+/*! ZSTD_getFrameHeader_advanced() :
+ *  same as ZSTD_getFrameHeader(),
+ *  with added capability to select a format (like ZSTD_f_zstd1_magicless) */
+ZSTDLIB_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format);
+ZSTDLIB_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize);  /*< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */
+
+ZSTDLIB_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx);
+ZSTDLIB_API size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+ZSTDLIB_API size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict);
+
+ZSTDLIB_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx);
+ZSTDLIB_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+/* misc */
+ZSTDLIB_API void   ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx);
+typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e;
+ZSTDLIB_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
+
+
+
+
+/* ============================ */
+/*       Block level API       */
+/* ============================ */
+
+/*!
+    Block functions produce and decode raw zstd blocks, without frame metadata.
+    Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes).
+    But users will have to take in charge needed metadata to regenerate data, such as compressed and content sizes.
+
+    A few rules to respect :
+    - Compressing and decompressing require a context structure
+      + Use ZSTD_createCCtx() and ZSTD_createDCtx()
+    - It is necessary to init context before starting
+      + compression : any ZSTD_compressBegin*() variant, including with dictionary
+      + decompression : any ZSTD_decompressBegin*() variant, including with dictionary
+      + copyCCtx() and copyDCtx() can be used too
+    - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB
+      + If input is larger than a block size, it's necessary to split input data into multiple blocks
+      + For inputs larger than a single block, consider using regular ZSTD_compress() instead.
+        Frame metadata is not that costly, and quickly becomes negligible as source size grows larger than a block.
+    - When a block is considered not compressible enough, ZSTD_compressBlock() result will be 0 (zero) !
+      ===> In which case, nothing is produced into `dst` !
+      + User __must__ test for such outcome and deal directly with uncompressed data
+      + A block cannot be declared incompressible if ZSTD_compressBlock() return value was != 0.
+        Doing so would mess up with statistics history, leading to potential data corruption.
+      + ZSTD_decompressBlock() _doesn't accept uncompressed data as input_ !!
+      + In case of multiple successive blocks, should some of them be uncompressed,
+        decoder must be informed of their existence in order to follow proper history.
+        Use ZSTD_insertBlock() for such a case.
+*/
 
-/*-*****************************************************************************
- * Buffer-less streaming compression (synchronous mode)
- *
- * A ZSTD_CCtx object is required to track streaming operations.
- * Use ZSTD_initCCtx() to initialize a context.
- * ZSTD_CCtx object can be re-used multiple times within successive compression
- * operations.
- *
- * Start by initializing a context.
- * Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary
- * compression,
- * or ZSTD_compressBegin_advanced(), for finer parameter control.
- * It's also possible to duplicate a reference context which has already been
- * initialized, using ZSTD_copyCCtx()
- *
- * Then, consume your input using ZSTD_compressContinue().
- * There are some important considerations to keep in mind when using this
- * advanced function :
- * - ZSTD_compressContinue() has no internal buffer. It uses externally provided
- *   buffer only.
- * - Interface is synchronous : input is consumed entirely and produce 1+
- *   (or more) compressed blocks.
- * - Caller must ensure there is enough space in `dst` to store compressed data
- *   under worst case scenario. Worst case evaluation is provided by
- *   ZSTD_compressBound().
- *   ZSTD_compressContinue() doesn't guarantee recover after a failed
- *   compression.
- * - ZSTD_compressContinue() presumes prior input ***is still accessible and
- *   unmodified*** (up to maximum distance size, see WindowLog).
- *   It remembers all previous contiguous blocks, plus one separated memory
- *   segment (which can itself consists of multiple contiguous blocks)
- * - ZSTD_compressContinue() detects that prior input has been overwritten when
- *   `src` buffer overlaps. In which case, it will "discard" the relevant memory
- *   section from its history.
- *
- * Finish a frame with ZSTD_compressEnd(), which will write the last block(s)
- * and optional checksum. It's possible to use srcSize==0, in which case, it
- * will write a final empty block to end the frame. Without last block mark,
- * frames will be considered unfinished (corrupted) by decoders.
- *
- * `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress some new
- * frame.
- ******************************************************************************/
+/*=====   Raw zstd block functions  =====*/
+ZSTDLIB_API size_t ZSTD_getBlockSize   (const ZSTD_CCtx* cctx);
+ZSTDLIB_API size_t ZSTD_compressBlock  (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTDLIB_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTDLIB_API size_t ZSTD_insertBlock    (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize);  /*< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */
 
-/*=====   Buffer-less streaming compression functions  =====*/
-size_t ZSTD_compressBegin(ZSTD_CCtx *cctx, int compressionLevel);
-size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx *cctx, const void *dict,
-	size_t dictSize, int compressionLevel);
-size_t ZSTD_compressBegin_advanced(ZSTD_CCtx *cctx, const void *dict,
-	size_t dictSize, ZSTD_parameters params,
-	unsigned long long pledgedSrcSize);
-size_t ZSTD_copyCCtx(ZSTD_CCtx *cctx, const ZSTD_CCtx *preparedCCtx,
-	unsigned long long pledgedSrcSize);
-size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx *cctx, const ZSTD_CDict *cdict,
-	unsigned long long pledgedSrcSize);
-size_t ZSTD_compressContinue(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity,
-	const void *src, size_t srcSize);
-size_t ZSTD_compressEnd(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity,
-	const void *src, size_t srcSize);
-
-
-
-/*-*****************************************************************************
- * Buffer-less streaming decompression (synchronous mode)
- *
- * A ZSTD_DCtx object is required to track streaming operations.
- * Use ZSTD_initDCtx() to initialize a context.
- * A ZSTD_DCtx object can be re-used multiple times.
- *
- * First typical operation is to retrieve frame parameters, using
- * ZSTD_getFrameParams(). It fills a ZSTD_frameParams structure which provide
- * important information to correctly decode the frame, such as the minimum
- * rolling buffer size to allocate to decompress data (`windowSize`), and the
- * dictionary ID used.
- * Note: content size is optional, it may not be present. 0 means unknown.
- * Note that these values could be wrong, either because of data malformation,
- * or because an attacker is spoofing deliberate false information. As a
- * consequence, check that values remain within valid application range,
- * especially `windowSize`, before allocation. Each application can set its own
- * limit, depending on local restrictions. For extended interoperability, it is
- * recommended to support at least 8 MB.
- * Frame parameters are extracted from the beginning of the compressed frame.
- * Data fragment must be large enough to ensure successful decoding, typically
- * `ZSTD_frameHeaderSize_max` bytes.
- * Result: 0: successful decoding, the `ZSTD_frameParams` structure is filled.
- *        >0: `srcSize` is too small, provide at least this many bytes.
- *        errorCode, which can be tested using ZSTD_isError().
- *
- * Start decompression, with ZSTD_decompressBegin() or
- * ZSTD_decompressBegin_usingDict(). Alternatively, you can copy a prepared
- * context, using ZSTD_copyDCtx().
- *
- * Then use ZSTD_nextSrcSizeToDecompress() and ZSTD_decompressContinue()
- * alternatively.
- * ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize'
- * to ZSTD_decompressContinue().
- * ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will
- * fail.
- *
- * The result of ZSTD_decompressContinue() is the number of bytes regenerated
- * within 'dst' (necessarily <= dstCapacity). It can be zero, which is not an
- * error; it just means ZSTD_decompressContinue() has decoded some metadata
- * item. It can also be an error code, which can be tested with ZSTD_isError().
- *
- * ZSTD_decompressContinue() needs previous data blocks during decompression, up
- * to `windowSize`. They should preferably be located contiguously, prior to
- * current block. Alternatively, a round buffer of sufficient size is also
- * possible. Sufficient size is determined by frame parameters.
- * ZSTD_decompressContinue() is very sensitive to contiguity, if 2 blocks don't
- * follow each other, make sure that either the compressor breaks contiguity at
- * the same place, or that previous contiguous segment is large enough to
- * properly handle maximum back-reference.
- *
- * A frame is fully decoded when ZSTD_nextSrcSizeToDecompress() returns zero.
- * Context can then be reset to start a new decompression.
- *
- * Note: it's possible to know if next input to present is a header or a block,
- * using ZSTD_nextInputType(). This information is not required to properly
- * decode a frame.
- *
- * == Special case: skippable frames ==
- *
- * Skippable frames allow integration of user-defined data into a flow of
- * concatenated frames. Skippable frames will be ignored (skipped) by a
- * decompressor. The format of skippable frames is as follows:
- * a) Skippable frame ID - 4 Bytes, Little endian format, any value from
- *    0x184D2A50 to 0x184D2A5F
- * b) Frame Size - 4 Bytes, Little endian format, unsigned 32-bits
- * c) Frame Content - any content (User Data) of length equal to Frame Size
- * For skippable frames ZSTD_decompressContinue() always returns 0.
- * For skippable frames ZSTD_getFrameParams() returns fparamsPtr->windowLog==0
- * what means that a frame is skippable.
- * Note: If fparamsPtr->frameContentSize==0, it is ambiguous: the frame might
- *       actually be a zstd encoded frame with no content. For purposes of
- *       decompression, it is valid in both cases to skip the frame using
- *       ZSTD_findFrameCompressedSize() to find its size in bytes.
- * It also returns frame size as fparamsPtr->frameContentSize.
- ******************************************************************************/
 
-/*=====   Buffer-less streaming decompression functions  =====*/
-size_t ZSTD_decompressBegin(ZSTD_DCtx *dctx);
-size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx *dctx, const void *dict,
-	size_t dictSize);
-void   ZSTD_copyDCtx(ZSTD_DCtx *dctx, const ZSTD_DCtx *preparedDCtx);
-size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx *dctx);
-size_t ZSTD_decompressContinue(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity,
-	const void *src, size_t srcSize);
-typedef enum {
-	ZSTDnit_frameHeader,
-	ZSTDnit_blockHeader,
-	ZSTDnit_block,
-	ZSTDnit_lastBlock,
-	ZSTDnit_checksum,
-	ZSTDnit_skippableFrame
-} ZSTD_nextInputType_e;
-ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx *dctx);
-
-/*-*****************************************************************************
- * Block functions
- *
- * Block functions produce and decode raw zstd blocks, without frame metadata.
- * Frame metadata cost is typically ~18 bytes, which can be non-negligible for
- * very small blocks (< 100 bytes). User will have to take in charge required
- * information to regenerate data, such as compressed and content sizes.
- *
- * A few rules to respect:
- * - Compressing and decompressing require a context structure
- *   + Use ZSTD_initCCtx() and ZSTD_initDCtx()
- * - It is necessary to init context before starting
- *   + compression : ZSTD_compressBegin()
- *   + decompression : ZSTD_decompressBegin()
- *   + variants _usingDict() are also allowed
- *   + copyCCtx() and copyDCtx() work too
- * - Block size is limited, it must be <= ZSTD_getBlockSizeMax()
- *   + If you need to compress more, cut data into multiple blocks
- *   + Consider using the regular ZSTD_compress() instead, as frame metadata
- *     costs become negligible when source size is large.
- * - When a block is considered not compressible enough, ZSTD_compressBlock()
- *   result will be zero. In which case, nothing is produced into `dst`.
- *   + User must test for such outcome and deal directly with uncompressed data
- *   + ZSTD_decompressBlock() doesn't accept uncompressed data as input!!!
- *   + In case of multiple successive blocks, decoder must be informed of
- *     uncompressed block existence to follow proper history. Use
- *     ZSTD_insertBlock() in such a case.
- ******************************************************************************/
+#endif   /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */
 
-/* Define for static allocation */
-#define ZSTD_BLOCKSIZE_ABSOLUTEMAX (128 * 1024)
-/*=====   Raw zstd block functions  =====*/
-size_t ZSTD_getBlockSizeMax(ZSTD_CCtx *cctx);
-size_t ZSTD_compressBlock(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity,
-	const void *src, size_t srcSize);
-size_t ZSTD_decompressBlock(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity,
-	const void *src, size_t srcSize);
-size_t ZSTD_insertBlock(ZSTD_DCtx *dctx, const void *blockStart,
-	size_t blockSize);
-
-#endif  /* ZSTD_H */
-- 
cgit v1.2.3


From fa443bc3c1e4b28d9315dea882e8358ba6e26f8b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= <linux@weissschuh.net>
Date: Fri, 29 Oct 2021 17:28:56 +0200
Subject: HID: intel-ish-hid: add support for MODULE_DEVICE_TABLE()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This allows to selectively autoload drivers for ISH devices.
Currently all ISH drivers are loaded for all systems having any ISH
device.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Acked-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Acked-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 include/linux/mod_devicetable.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index ae2e75d15b21..befbf53c4b7c 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -895,4 +895,17 @@ struct dfl_device_id {
 	kernel_ulong_t driver_data;
 };
 
+/* ISHTP (Integrated Sensor Hub Transport Protocol) */
+
+#define ISHTP_MODULE_PREFIX	"ishtp:"
+
+/**
+ * struct ishtp_device_id - ISHTP device identifier
+ * @guid_string: 36 char string of the form fa50ff2b-f2e8-45de-83fa-65417f2f49ba
+ * @context: pointer to driver specific data
+ */
+struct ishtp_device_id {
+	guid_t guid;
+};
+
 #endif /* LINUX_MOD_DEVICETABLE_H */
-- 
cgit v1.2.3


From 9ef4d0209cbadb63656a7aa29fde49c27ab2b9bf Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Tue, 9 Nov 2021 15:11:41 +0800
Subject: blk-mq: add one API for waiting until quiesce is done

Some drivers(NVMe, SCSI) need to call quiesce and unquiesce in pair, but it
is hard to switch to this style, so these drivers need one atomic flag for
helping to balance quiesce and unquiesce.

When quiesce is in-progress, the driver still needs to wait until
the quiesce is done, so add API of blk_mq_wait_quiesce_done() for
these drivers.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Link: https://lore.kernel.org/r/20211109071144.181581-2-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index b4039fdf1b04..d53ee59ba131 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -803,6 +803,7 @@ void blk_mq_start_hw_queues(struct request_queue *q);
 void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
 void blk_mq_quiesce_queue(struct request_queue *q);
+void blk_mq_wait_quiesce_done(struct request_queue *q);
 void blk_mq_unquiesce_queue(struct request_queue *q);
 void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
-- 
cgit v1.2.3


From 51b8c1fe250d1bd70c1722dc3c414f5cff2d7cca Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 8 Nov 2021 18:31:24 -0800
Subject: vfs: keep inodes with page cache off the inode shrinker LRU

Historically (pre-2.5), the inode shrinker used to reclaim only empty
inodes and skip over those that still contained page cache.  This caused
problems on highmem hosts: struct inode could put fill lowmem zones
before the cache was getting reclaimed in the highmem zones.

To address this, the inode shrinker started to strip page cache to
facilitate reclaiming lowmem.  However, this comes with its own set of
problems: the shrinkers may drop actively used page cache just because
the inodes are not currently open or dirty - think working with a large
git tree.  It further doesn't respect cgroup memory protection settings
and can cause priority inversions between containers.

Nowadays, the page cache also holds non-resident info for evicted cache
pages in order to detect refaults.  We've come to rely heavily on this
data inside reclaim for protecting the cache workingset and driving swap
behavior.  We also use it to quantify and report workload health through
psi.  The latter in turn is used for fleet health monitoring, as well as
driving automated memory sizing of workloads and containers, proactive
reclaim and memory offloading schemes.

The consequences of dropping page cache prematurely is that we're seeing
subtle and not-so-subtle failures in all of the above-mentioned
scenarios, with the workload generally entering unexpected thrashing
states while losing the ability to reliably detect it.

To fix this on non-highmem systems at least, going back to rotating
inodes on the LRU isn't feasible.  We've tried (commit a76cf1a474d7
("mm: don't reclaim inodes with many attached pages")) and failed
(commit 69056ee6a8a3 ("Revert "mm: don't reclaim inodes with many
attached pages"")).

The issue is mostly that shrinker pools attract pressure based on their
size, and when objects get skipped the shrinkers remember this as
deferred reclaim work.  This accumulates excessive pressure on the
remaining inodes, and we can quickly eat into heavily used ones, or
dirty ones that require IO to reclaim, when there potentially is plenty
of cold, clean cache around still.

Instead, this patch keeps populated inodes off the inode LRU in the
first place - just like an open file or dirty state would.  An otherwise
clean and unused inode then gets queued when the last cache entry
disappears.  This solves the problem without reintroducing the reclaim
issues, and generally is a bit more scalable than having to wade through
potentially hundreds of thousands of busy inodes.

Locking is a bit tricky because the locks protecting the inode state
(i_lock) and the inode LRU (lru_list.lock) don't nest inside the
irq-safe page cache lock (i_pages.xa_lock).  Page cache deletions are
serialized through i_lock, taken before the i_pages lock, to make sure
depopulated inodes are queued reliably.  Additions may race with
deletions, but we'll check again in the shrinker.  If additions race
with the shrinker itself, we're protected by the i_lock: if find_inode()
or iput() win, the shrinker will bail on the elevated i_count or
I_REFERENCED; if the shrinker wins and goes ahead with the inode, it
will set I_FREEING and inhibit further igets(), which will cause the
other side to create a new instance of the inode instead.

Link: https://lkml.kernel.org/r/20210614211904.14420-4-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/fs.h      |  1 +
 include/linux/pagemap.h | 50 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 226de651f52e..de35a2640e70 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3193,6 +3193,7 @@ static inline void remove_inode_hash(struct inode *inode)
 }
 
 extern void inode_sb_list_add(struct inode *inode);
+extern void inode_add_lru(struct inode *inode);
 
 extern int sb_set_blocksize(struct super_block *, int);
 extern int sb_min_blocksize(struct super_block *, int);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 62db6b0176b9..5c74a45ff97a 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -23,6 +23,56 @@ static inline bool mapping_empty(struct address_space *mapping)
 	return xa_empty(&mapping->i_pages);
 }
 
+/*
+ * mapping_shrinkable - test if page cache state allows inode reclaim
+ * @mapping: the page cache mapping
+ *
+ * This checks the mapping's cache state for the pupose of inode
+ * reclaim and LRU management.
+ *
+ * The caller is expected to hold the i_lock, but is not required to
+ * hold the i_pages lock, which usually protects cache state. That's
+ * because the i_lock and the list_lru lock that protect the inode and
+ * its LRU state don't nest inside the irq-safe i_pages lock.
+ *
+ * Cache deletions are performed under the i_lock, which ensures that
+ * when an inode goes empty, it will reliably get queued on the LRU.
+ *
+ * Cache additions do not acquire the i_lock and may race with this
+ * check, in which case we'll report the inode as shrinkable when it
+ * has cache pages. This is okay: the shrinker also checks the
+ * refcount and the referenced bit, which will be elevated or set in
+ * the process of adding new cache pages to an inode.
+ */
+static inline bool mapping_shrinkable(struct address_space *mapping)
+{
+	void *head;
+
+	/*
+	 * On highmem systems, there could be lowmem pressure from the
+	 * inodes before there is highmem pressure from the page
+	 * cache. Make inodes shrinkable regardless of cache state.
+	 */
+	if (IS_ENABLED(CONFIG_HIGHMEM))
+		return true;
+
+	/* Cache completely empty? Shrink away. */
+	head = rcu_access_pointer(mapping->i_pages.xa_head);
+	if (!head)
+		return true;
+
+	/*
+	 * The xarray stores single offset-0 entries directly in the
+	 * head pointer, which allows non-resident page cache entries
+	 * to escape the shadow shrinker's list of xarray nodes. The
+	 * inode shrinker needs to pick them up under memory pressure.
+	 */
+	if (!xa_is_node(head) && xa_is_value(head))
+		return true;
+
+	return false;
+}
+
 /*
  * Bits in mapping->flags.
  */
-- 
cgit v1.2.3


From 83c1fd763b3220458652f7d656d5047292ebcde4 Mon Sep 17 00:00:00 2001
From: zhangyiru <zhangyiru3@huawei.com>
Date: Mon, 8 Nov 2021 18:31:27 -0800
Subject: mm,hugetlb: remove mlock ulimit for SHM_HUGETLB

Commit 21a3c273f88c ("mm, hugetlb: add thread name and pid to
SHM_HUGETLB mlock rlimit warning") marked this as deprecated in 2012,
but it is not deleted yet.

Mike says he still sees that message in log files on occasion, so maybe we
should preserve this warning.

Also remove hugetlbfs related user_shm_unlock in ipc/shm.c and remove the
user_shm_unlock after out.

Link: https://lkml.kernel.org/r/20211103105857.25041-1-zhangyiru3@huawei.com
Signed-off-by: zhangyiru <zhangyiru3@huawei.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Liu Zixian <liuzixian4@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: wuxu.wu <wuxu.wu@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 44c2ab0dfa59..00351ccb49a3 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -477,8 +477,7 @@ static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
 extern const struct file_operations hugetlbfs_file_operations;
 extern const struct vm_operations_struct hugetlb_vm_ops;
 struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
-				struct ucounts **ucounts, int creat_flags,
-				int page_size_log);
+				int creat_flags, int page_size_log);
 
 static inline bool is_file_hugepages(struct file *file)
 {
@@ -497,8 +496,7 @@ static inline struct hstate *hstate_inode(struct inode *i)
 #define is_file_hugepages(file)			false
 static inline struct file *
 hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag,
-		struct ucounts **ucounts, int creat_flags,
-		int page_size_log)
+		int creat_flags, int page_size_log)
 {
 	return ERR_PTR(-ENOSYS);
 }
-- 
cgit v1.2.3


From cc5f2704c93456e6f1c671c5cf7b582a55df5484 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Mon, 8 Nov 2021 18:31:48 -0800
Subject: proc/vmcore: convert oldmem_pfn_is_ram callback to more generic
 vmcore callbacks

Let's support multiple registered callbacks, making sure that
registering vmcore callbacks cannot fail.  Make the callback return a
bool instead of an int, handling how to deal with errors internally.
Drop unused HAVE_OLDMEM_PFN_IS_RAM.

We soon want to make use of this infrastructure from other drivers:
virtio-mem, registering one callback for each virtio-mem device, to
prevent reading unplugged virtio-mem memory.

Handle it via a generic vmcore_cb structure, prepared for future
extensions: for example, once we support virtio-mem on s390x where the
vmcore is completely constructed in the second kernel, we want to detect
and add plugged virtio-mem memory ranges to the vmcore in order for them
to get dumped properly.

Handle corner cases that are unexpected and shouldn't happen in sane
setups: registering a callback after the vmcore has already been opened
(warn only) and unregistering a callback after the vmcore has already been
opened (warn and essentially read only zeroes from that point on).

Link: https://lkml.kernel.org/r/20211005121430.30136-6-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/crash_dump.h | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h
index 2618577a4d6d..0c547d866f1e 100644
--- a/include/linux/crash_dump.h
+++ b/include/linux/crash_dump.h
@@ -91,9 +91,29 @@ static inline void vmcore_unusable(void)
 		elfcorehdr_addr = ELFCORE_ADDR_ERR;
 }
 
-#define HAVE_OLDMEM_PFN_IS_RAM 1
-extern int register_oldmem_pfn_is_ram(int (*fn)(unsigned long pfn));
-extern void unregister_oldmem_pfn_is_ram(void);
+/**
+ * struct vmcore_cb - driver callbacks for /proc/vmcore handling
+ * @pfn_is_ram: check whether a PFN really is RAM and should be accessed when
+ *              reading the vmcore. Will return "true" if it is RAM or if the
+ *              callback cannot tell. If any callback returns "false", it's not
+ *              RAM and the page must not be accessed; zeroes should be
+ *              indicated in the vmcore instead. For example, a ballooned page
+ *              contains no data and reading from such a page will cause high
+ *              load in the hypervisor.
+ * @next: List head to manage registered callbacks internally; initialized by
+ *        register_vmcore_cb().
+ *
+ * vmcore callbacks allow drivers managing physical memory ranges to
+ * coordinate with vmcore handling code, for example, to prevent accessing
+ * physical memory ranges that should not be accessed when reading the vmcore,
+ * although included in the vmcore header as memory ranges to dump.
+ */
+struct vmcore_cb {
+	bool (*pfn_is_ram)(struct vmcore_cb *cb, unsigned long pfn);
+	struct list_head next;
+};
+extern void register_vmcore_cb(struct vmcore_cb *cb);
+extern void unregister_vmcore_cb(struct vmcore_cb *cb);
 
 #else /* !CONFIG_CRASH_DUMP */
 static inline bool is_kdump_kernel(void) { return 0; }
-- 
cgit v1.2.3


From f5d80614844a725cbf66fd6524e01c218ede2141 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 8 Nov 2021 18:32:08 -0800
Subject: kernel.h: drop unneeded <linux/kernel.h> inclusion from other headers

Patch series "kernel.h further split", v5.

kernel.h is a set of something which is not related to each other and
often used in non-crossed compilation units, especially when drivers
need only one or two macro definitions from it.

This patch (of 7):

There is no evidence we need kernel.h inclusion in certain headers.
Drop unneeded <linux/kernel.h> inclusion from other headers.

[sfr@canb.auug.org.au: bottom_half.h needs kernel]
  Link: https://lkml.kernel.org/r/20211015202908.1c417ae2@canb.auug.org.au

Link: https://lkml.kernel.org/r/20211013170417.87909-1-andriy.shevchenko@linux.intel.com
Link: https://lkml.kernel.org/r/20211013170417.87909-2-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Brendan Higgins <brendanhiggins@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Will Deacon <will@kernel.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Sakari Ailus <sakari.ailus@linux.intel.com>
Cc: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
Cc: Miguel Ojeda <miguel.ojeda.sandonis@gmail.com>
Cc: Jonathan Cameron <jic23@kernel.org>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Thorsten Leemhuis <regressions@leemhuis.info>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bottom_half.h | 1 +
 include/linux/rwsem.h       | 1 -
 include/linux/smp.h         | 1 -
 include/linux/spinlock.h    | 1 -
 4 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h
index eed86eb0a1de..11d107d88d03 100644
--- a/include/linux/bottom_half.h
+++ b/include/linux/bottom_half.h
@@ -2,6 +2,7 @@
 #ifndef _LINUX_BH_H
 #define _LINUX_BH_H
 
+#include <linux/kernel.h>
 #include <linux/preempt.h>
 
 #if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_TRACE_IRQFLAGS)
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 352c6127cb90..f9348769e558 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -11,7 +11,6 @@
 #include <linux/linkage.h>
 
 #include <linux/types.h>
-#include <linux/kernel.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/atomic.h>
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 510519e8a1eb..a80ab58ae3f1 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -108,7 +108,6 @@ static inline void on_each_cpu_cond(smp_cond_func_t cond_func,
 #ifdef CONFIG_SMP
 
 #include <linux/preempt.h>
-#include <linux/kernel.h>
 #include <linux/compiler.h>
 #include <linux/thread_info.h>
 #include <asm/smp.h>
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 45310ea1b1d7..22d672f81705 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -57,7 +57,6 @@
 #include <linux/compiler.h>
 #include <linux/irqflags.h>
 #include <linux/thread_info.h>
-#include <linux/kernel.h>
 #include <linux/stringify.h>
 #include <linux/bottom_half.h>
 #include <linux/lockdep.h>
-- 
cgit v1.2.3


From d2a8ebbf8192b84b11f1b204c4f7c602df32aeac Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 8 Nov 2021 18:32:12 -0800
Subject: kernel.h: split out container_of() and typeof_member() macros

kernel.h is being used as a dump for all kinds of stuff for a long time.
Here is the attempt cleaning it up by splitting out container_of() and
typeof_member() macros.

For time being include new header back to kernel.h to avoid twisted
indirected includes for existing users.

Note, there are _a lot_ of headers and modules that include kernel.h
solely for one of these macros and this allows to unburden compiler for
the twisted inclusion paths and to make new code cleaner in the future.

Link: https://lkml.kernel.org/r/20211013170417.87909-3-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Brendan Higgins <brendanhiggins@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jonathan Cameron <jic23@kernel.org>
Cc: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
Cc: Miguel Ojeda <miguel.ojeda.sandonis@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Sakari Ailus <sakari.ailus@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thorsten Leemhuis <regressions@leemhuis.info>
Cc: Waiman Long <longman@redhat.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/container_of.h | 40 ++++++++++++++++++++++++++++++++++++++++
 include/linux/kernel.h       | 33 +--------------------------------
 2 files changed, 41 insertions(+), 32 deletions(-)
 create mode 100644 include/linux/container_of.h

(limited to 'include/linux')

diff --git a/include/linux/container_of.h b/include/linux/container_of.h
new file mode 100644
index 000000000000..dd56019838c6
--- /dev/null
+++ b/include/linux/container_of.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_CONTAINER_OF_H
+#define _LINUX_CONTAINER_OF_H
+
+#include <linux/build_bug.h>
+#include <linux/err.h>
+
+#define typeof_member(T, m)	typeof(((T*)0)->m)
+
+/**
+ * container_of - cast a member of a structure out to the containing structure
+ * @ptr:	the pointer to the member.
+ * @type:	the type of the container struct this is embedded in.
+ * @member:	the name of the member within the struct.
+ *
+ */
+#define container_of(ptr, type, member) ({				\
+	void *__mptr = (void *)(ptr);					\
+	BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) &&	\
+			 !__same_type(*(ptr), void),			\
+			 "pointer type mismatch in container_of()");	\
+	((type *)(__mptr - offsetof(type, member))); })
+
+/**
+ * container_of_safe - cast a member of a structure out to the containing structure
+ * @ptr:	the pointer to the member.
+ * @type:	the type of the container struct this is embedded in.
+ * @member:	the name of the member within the struct.
+ *
+ * If IS_ERR_OR_NULL(ptr), ptr is returned unchanged.
+ */
+#define container_of_safe(ptr, type, member) ({				\
+	void *__mptr = (void *)(ptr);					\
+	BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) &&	\
+			 !__same_type(*(ptr), void),			\
+			 "pointer type mismatch in container_of()");	\
+	IS_ERR_OR_NULL(__mptr) ? ERR_CAST(__mptr) :			\
+		((type *)(__mptr - offsetof(type, member))); })
+
+#endif	/* _LINUX_CONTAINER_OF_H */
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 471bc0593679..ed4465757cec 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -9,6 +9,7 @@
 #include <linux/stddef.h>
 #include <linux/types.h>
 #include <linux/compiler.h>
+#include <linux/container_of.h>
 #include <linux/bitops.h>
 #include <linux/kstrtox.h>
 #include <linux/log2.h>
@@ -52,8 +53,6 @@
 }					\
 )
 
-#define typeof_member(T, m)	typeof(((T*)0)->m)
-
 #define _RET_IP_		(unsigned long)__builtin_return_address(0)
 #define _THIS_IP_  ({ __label__ __here; __here: (unsigned long)&&__here; })
 
@@ -484,36 +483,6 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
 #define __CONCAT(a, b) a ## b
 #define CONCATENATE(a, b) __CONCAT(a, b)
 
-/**
- * container_of - cast a member of a structure out to the containing structure
- * @ptr:	the pointer to the member.
- * @type:	the type of the container struct this is embedded in.
- * @member:	the name of the member within the struct.
- *
- */
-#define container_of(ptr, type, member) ({				\
-	void *__mptr = (void *)(ptr);					\
-	BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) &&	\
-			 !__same_type(*(ptr), void),			\
-			 "pointer type mismatch in container_of()");	\
-	((type *)(__mptr - offsetof(type, member))); })
-
-/**
- * container_of_safe - cast a member of a structure out to the containing structure
- * @ptr:	the pointer to the member.
- * @type:	the type of the container struct this is embedded in.
- * @member:	the name of the member within the struct.
- *
- * If IS_ERR_OR_NULL(ptr), ptr is returned unchanged.
- */
-#define container_of_safe(ptr, type, member) ({				\
-	void *__mptr = (void *)(ptr);					\
-	BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) &&	\
-			 !__same_type(*(ptr), void),			\
-			 "pointer type mismatch in container_of()");	\
-	IS_ERR_OR_NULL(__mptr) ? ERR_CAST(__mptr) :			\
-		((type *)(__mptr - offsetof(type, member))); })
-
 /* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
 # define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD
-- 
cgit v1.2.3


From cd7187e112c91186185c0553b1f9d033ed399d3a Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 8 Nov 2021 18:32:19 -0800
Subject: include/linux/list.h: replace kernel.h with the necessary inclusions

When kernel.h is used in the headers it adds a lot into dependency hell,
especially when there are circular dependencies are involved.

Replace kernel.h inclusion with the list of what is really being used.

Link: https://lkml.kernel.org/r/20211013170417.87909-5-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Brendan Higgins <brendanhiggins@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jonathan Cameron <jic23@kernel.org>
Cc: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
Cc: Miguel Ojeda <miguel.ojeda.sandonis@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Sakari Ailus <sakari.ailus@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thorsten Leemhuis <regressions@leemhuis.info>
Cc: Waiman Long <longman@redhat.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/list.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/list.h b/include/linux/list.h
index f2af4b4aa4e9..6636fc07f918 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -2,11 +2,13 @@
 #ifndef _LINUX_LIST_H
 #define _LINUX_LIST_H
 
+#include <linux/container_of.h>
 #include <linux/types.h>
 #include <linux/stddef.h>
 #include <linux/poison.h>
 #include <linux/const.h>
-#include <linux/kernel.h>
+
+#include <asm/barrier.h>
 
 /*
  * Circular doubly linked list implementation.
-- 
cgit v1.2.3


From 50b09d6145da04e19fa448670e4918ce496f1c77 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 8 Nov 2021 18:32:22 -0800
Subject: include/linux/llist.h: replace kernel.h with the necessary inclusions

When kernel.h is used in the headers it adds a lot into dependency hell,
especially when there are circular dependencies are involved.

Replace kernel.h inclusion with the list of what is really being used.

Link: https://lkml.kernel.org/r/20211013170417.87909-6-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Brendan Higgins <brendanhiggins@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jonathan Cameron <jic23@kernel.org>
Cc: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
Cc: Miguel Ojeda <miguel.ojeda.sandonis@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Sakari Ailus <sakari.ailus@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thorsten Leemhuis <regressions@leemhuis.info>
Cc: Waiman Long <longman@redhat.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/llist.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/llist.h b/include/linux/llist.h
index 24f207b0190b..85bda2d02d65 100644
--- a/include/linux/llist.h
+++ b/include/linux/llist.h
@@ -49,7 +49,9 @@
  */
 
 #include <linux/atomic.h>
-#include <linux/kernel.h>
+#include <linux/container_of.h>
+#include <linux/stddef.h>
+#include <linux/types.h>
 
 struct llist_head {
 	struct llist_node *first;
-- 
cgit v1.2.3


From c540f9595956ff3c4b2cca1145c893941f921bcd Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 8 Nov 2021 18:32:25 -0800
Subject: include/linux/plist.h: replace kernel.h with the necessary inclusions

When kernel.h is used in the headers it adds a lot into dependency hell,
especially when there are circular dependencies are involved.

Replace kernel.h inclusion with the list of what is really being used.

Link: https://lkml.kernel.org/r/20211013170417.87909-7-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Brendan Higgins <brendanhiggins@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jonathan Cameron <jic23@kernel.org>
Cc: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
Cc: Miguel Ojeda <miguel.ojeda.sandonis@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Sakari Ailus <sakari.ailus@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thorsten Leemhuis <regressions@leemhuis.info>
Cc: Waiman Long <longman@redhat.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/plist.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/plist.h b/include/linux/plist.h
index 66bab1bca35c..0f352c1d3c80 100644
--- a/include/linux/plist.h
+++ b/include/linux/plist.h
@@ -73,8 +73,11 @@
 #ifndef _LINUX_PLIST_H_
 #define _LINUX_PLIST_H_
 
-#include <linux/kernel.h>
+#include <linux/container_of.h>
 #include <linux/list.h>
+#include <linux/types.h>
+
+#include <asm/bug.h>
 
 struct plist_head {
 	struct list_head node_list;
-- 
cgit v1.2.3


From 5f6286a608107a68646241b57d2bd2a4fc139ef9 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 8 Nov 2021 18:32:32 -0800
Subject: include/linux/delay.h: replace kernel.h with the necessary inclusions

When kernel.h is used in the headers it adds a lot into dependency hell,
especially when there are circular dependencies are involved.

Replace kernel.h inclusion with the list of what is really being used.

[akpm@linux-foundation.org: cxd2880_common.h needs bits.h for GENMASK()]
[andriy.shevchenko@linux.intel.com: delay.h: fix for removed kernel.h]
  Link: https://lkml.kernel.org/r/20211028170143.56523-1-andriy.shevchenko@linux.intel.com
[akpm@linux-foundation.org: include/linux/fwnode.h needs bits.h for BIT()]

Link: https://lkml.kernel.org/r/20211027150324.79827-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/delay.h  | 2 +-
 include/linux/fwnode.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/delay.h b/include/linux/delay.h
index 1d0e2ce6b6d9..8eacf67eb212 100644
--- a/include/linux/delay.h
+++ b/include/linux/delay.h
@@ -19,7 +19,7 @@
  *   https://lists.openwall.net/linux-kernel/2011/01/09/56
  */
 
-#include <linux/kernel.h>
+#include <linux/math.h>
 
 extern unsigned long loops_per_jiffy;
 
diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h
index 9f4ad719bfe3..3a532ba66f6c 100644
--- a/include/linux/fwnode.h
+++ b/include/linux/fwnode.h
@@ -11,6 +11,7 @@
 
 #include <linux/types.h>
 #include <linux/list.h>
+#include <linux/bits.h>
 #include <linux/err.h>
 
 struct fwnode_operations;
-- 
cgit v1.2.3


From 1fcbd5deac51f30a7ed3f10bea422d08a2b3aba6 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 8 Nov 2021 18:32:35 -0800
Subject: include/linux/sbitmap.h: replace kernel.h with the necessary
 inclusions

When kernel.h is used in the headers it adds a lot into dependency hell,
especially when there are circular dependencies are involved.

Replace kernel.h inclusion with the list of what is really being used.

Link: https://lkml.kernel.org/r/20211027150437.79921-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sbitmap.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index 2713e689ad66..8121034d1f13 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -9,8 +9,17 @@
 #ifndef __LINUX_SCALE_BITMAP_H
 #define __LINUX_SCALE_BITMAP_H
 
-#include <linux/kernel.h>
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/cache.h>
+#include <linux/list.h>
+#include <linux/log2.h>
+#include <linux/minmax.h>
+#include <linux/percpu.h>
 #include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/types.h>
+#include <linux/wait.h>
 
 struct seq_file;
 
-- 
cgit v1.2.3


From 98e1385ef24bd607586fbf44e73decd5112d3d4a Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 8 Nov 2021 18:32:38 -0800
Subject: include/linux/radix-tree.h: replace kernel.h with the necessary
 inclusions

When kernel.h is used in the headers it adds a lot into dependency hell,
especially when there are circular dependencies are involved.

Replace kernel.h inclusion with the list of what is really being used.

Link: https://lkml.kernel.org/r/20211027150528.80003-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/radix-tree.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 64ad900ac742..f7c1d21c2f39 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -9,8 +9,10 @@
 #define _LINUX_RADIX_TREE_H
 
 #include <linux/bitops.h>
-#include <linux/kernel.h>
+#include <linux/gfp.h>
 #include <linux/list.h>
+#include <linux/lockdep.h>
+#include <linux/math.h>
 #include <linux/percpu.h>
 #include <linux/preempt.h>
 #include <linux/rcupdate.h>
-- 
cgit v1.2.3


From b4b87651104dd32ab258d097d47b97e243a95222 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 8 Nov 2021 18:32:41 -0800
Subject: include/linux/generic-radix-tree.h: replace kernel.h with the
 necessary inclusions

When kernel.h is used in the headers it adds a lot into dependency hell,
especially when there are circular dependencies are involved.

Replace kernel.h inclusion with the list of what is really being used.

[akpm@linux-foundation.org: include math.h for round_up()]

Link: https://lkml.kernel.org/r/20211027150548.80042-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/generic-radix-tree.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h
index bfd00320c7f3..107613f7d792 100644
--- a/include/linux/generic-radix-tree.h
+++ b/include/linux/generic-radix-tree.h
@@ -38,8 +38,9 @@
 
 #include <asm/page.h>
 #include <linux/bug.h>
-#include <linux/kernel.h>
 #include <linux/log2.h>
+#include <linux/math.h>
+#include <linux/types.h>
 
 struct genradix_root;
 
-- 
cgit v1.2.3


From e52340de11d8bca3ba35e5a72fea4f2a5b6abbbb Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Mon, 8 Nov 2021 18:32:43 -0800
Subject: kernel.h: split out instruction pointer accessors

bottom_half.h needs _THIS_IP_ to be standalone, so split that and
_RET_IP_ out from kernel.h into the new instruction_pointer.h.  kernel.h
directly needs them, so include it there and replace the include of
kernel.h with this new file in bottom_half.h.

Link: https://lkml.kernel.org/r/20211028161248.45232-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bottom_half.h         | 2 +-
 include/linux/instruction_pointer.h | 8 ++++++++
 include/linux/kernel.h              | 4 +---
 3 files changed, 10 insertions(+), 4 deletions(-)
 create mode 100644 include/linux/instruction_pointer.h

(limited to 'include/linux')

diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h
index 11d107d88d03..fc53e0ad56d9 100644
--- a/include/linux/bottom_half.h
+++ b/include/linux/bottom_half.h
@@ -2,7 +2,7 @@
 #ifndef _LINUX_BH_H
 #define _LINUX_BH_H
 
-#include <linux/kernel.h>
+#include <linux/instruction_pointer.h>
 #include <linux/preempt.h>
 
 #if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_TRACE_IRQFLAGS)
diff --git a/include/linux/instruction_pointer.h b/include/linux/instruction_pointer.h
new file mode 100644
index 000000000000..cda1f706eaeb
--- /dev/null
+++ b/include/linux/instruction_pointer.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_INSTRUCTION_POINTER_H
+#define _LINUX_INSTRUCTION_POINTER_H
+
+#define _RET_IP_		(unsigned long)__builtin_return_address(0)
+#define _THIS_IP_  ({ __label__ __here; __here: (unsigned long)&&__here; })
+
+#endif /* _LINUX_INSTRUCTION_POINTER_H */
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index ed4465757cec..46ca4404fb93 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -20,6 +20,7 @@
 #include <linux/printk.h>
 #include <linux/build_bug.h>
 #include <linux/static_call_types.h>
+#include <linux/instruction_pointer.h>
 #include <asm/byteorder.h>
 
 #include <uapi/linux/kernel.h>
@@ -53,9 +54,6 @@
 }					\
 )
 
-#define _RET_IP_		(unsigned long)__builtin_return_address(0)
-#define _THIS_IP_  ({ __label__ __here; __here: (unsigned long)&&__here; })
-
 /**
  * upper_32_bits - return bits 32-63 of a number
  * @n: the number we're accessing
-- 
cgit v1.2.3


From e1edc277e6f6dfb372216522dfc57f9381c39e35 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Mon, 8 Nov 2021 18:32:46 -0800
Subject: linux/container_of.h: switch to static_assert

_Static_assert() is evaluated already in the compiler's frontend, and
gives a somehat more to-the-point error, compared to the BUILD_BUG_ON
macro, which only fires after the optimizer has had a chance to
eliminate calls to functions marked with __attribute__((error)).  In
theory, this might make builds a tiny bit faster.

There's also a little less gunk in the error message emitted:

  lib/sort.c: In function `foo':
  include/linux/build_bug.h:78:41: error: static assertion failed: "pointer type mismatch in container_of()"
     78 | #define __static_assert(expr, msg, ...) _Static_assert(expr, msg)

compared to

  lib/sort.c: In function `foo':
  include/linux/compiler_types.h:322:38: error: call to `__compiletime_assert_2' declared with attribute error: pointer type mismatch in container_of()
    322 |  _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)

While at it, fix the copy-pasto in container_of_safe().

Link: https://lkml.kernel.org/r/20211015090530.2774079-1-linux@rasmusvillemoes.dk
Link: https://lore.kernel.org/lkml/20211014132331.GA4811@kernel.org/T/
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Reviewed-by: Miguel Ojeda <ojeda@kernel.org>
Acked-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/container_of.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/container_of.h b/include/linux/container_of.h
index dd56019838c6..2f4944b791b8 100644
--- a/include/linux/container_of.h
+++ b/include/linux/container_of.h
@@ -16,9 +16,9 @@
  */
 #define container_of(ptr, type, member) ({				\
 	void *__mptr = (void *)(ptr);					\
-	BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) &&	\
-			 !__same_type(*(ptr), void),			\
-			 "pointer type mismatch in container_of()");	\
+	static_assert(__same_type(*(ptr), ((type *)0)->member) ||	\
+		      __same_type(*(ptr), void),			\
+		      "pointer type mismatch in container_of()");	\
 	((type *)(__mptr - offsetof(type, member))); })
 
 /**
@@ -31,9 +31,9 @@
  */
 #define container_of_safe(ptr, type, member) ({				\
 	void *__mptr = (void *)(ptr);					\
-	BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) &&	\
-			 !__same_type(*(ptr), void),			\
-			 "pointer type mismatch in container_of()");	\
+	static_assert(__same_type(*(ptr), ((type *)0)->member) ||	\
+		      __same_type(*(ptr), void),			\
+		      "pointer type mismatch in container_of_safe()");	\
 	IS_ERR_OR_NULL(__mptr) ? ERR_CAST(__mptr) :			\
 		((type *)(__mptr - offsetof(type, member))); })
 
-- 
cgit v1.2.3


From 505be48165fa2f2cb795659b4fc61f35563d105e Mon Sep 17 00:00:00 2001
From: Imran Khan <imran.f.khan@oracle.com>
Date: Mon, 8 Nov 2021 18:33:12 -0800
Subject: lib, stackdepot: add helper to print stack entries

To print a stack entries, users of stackdepot, first use stack_depot_fetch
to get a list of stack entries and then use stack_trace_print to print
this list.  Provide a helper in stackdepot to print stack entries based on
stackdepot handle.  Also change above mentioned users to use this helper.

Link: https://lkml.kernel.org/r/20210915014806.3206938-3-imran.f.khan@oracle.com
Signed-off-by: Imran Khan <imran.f.khan@oracle.com>
Suggested-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Alexander Potapenko <glider@google.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: David Airlie <airlied@linux.ie>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Maxime Ripard <mripard@kernel.org>
Cc: Thomas Zimmermann <tzimmermann@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/stackdepot.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h
index d29860966bc9..8ab37500fcba 100644
--- a/include/linux/stackdepot.h
+++ b/include/linux/stackdepot.h
@@ -25,6 +25,8 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries,
 unsigned int stack_depot_fetch(depot_stack_handle_t handle,
 			       unsigned long **entries);
 
+void stack_depot_print(depot_stack_handle_t stack);
+
 #ifdef CONFIG_STACKDEPOT
 int stack_depot_init(void);
 #else
-- 
cgit v1.2.3


From 0f68d45ef41abb618a9ca33996348ae73800a106 Mon Sep 17 00:00:00 2001
From: Imran Khan <imran.f.khan@oracle.com>
Date: Mon, 8 Nov 2021 18:33:16 -0800
Subject: lib, stackdepot: add helper to print stack entries into buffer

To print stack entries into a buffer, users of stackdepot, first get a
list of stack entries using stack_depot_fetch and then print this list
into a buffer using stack_trace_snprint.  Provide a helper in stackdepot
for this purpose.  Also change above mentioned users to use this helper.

[imran.f.khan@oracle.com: fix build error]
  Link: https://lkml.kernel.org/r/20210915175321.3472770-4-imran.f.khan@oracle.com
[imran.f.khan@oracle.com: export stack_depot_snprint() to modules]
  Link: https://lkml.kernel.org/r/20210916133535.3592491-4-imran.f.khan@oracle.com

Link: https://lkml.kernel.org/r/20210915014806.3206938-4-imran.f.khan@oracle.com
Signed-off-by: Imran Khan <imran.f.khan@oracle.com>
Suggested-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Jani Nikula <jani.nikula@intel.com>	[i915]
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: David Airlie <airlied@linux.ie>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Maxime Ripard <mripard@kernel.org>
Cc: Thomas Zimmermann <tzimmermann@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/stackdepot.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h
index 8ab37500fcba..c34b55a6e554 100644
--- a/include/linux/stackdepot.h
+++ b/include/linux/stackdepot.h
@@ -25,6 +25,9 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries,
 unsigned int stack_depot_fetch(depot_stack_handle_t handle,
 			       unsigned long **entries);
 
+int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size,
+		       int spaces);
+
 void stack_depot_print(depot_stack_handle_t stack);
 
 #ifdef CONFIG_STACKDEPOT
-- 
cgit v1.2.3


From bfb3ba32061da1a9217ef6d02fbcffb528e4c8df Mon Sep 17 00:00:00 2001
From: Lucas De Marchi <lucas.demarchi@intel.com>
Date: Mon, 8 Nov 2021 18:33:19 -0800
Subject: include/linux/string_helpers.h: add linux/string.h for strlen()

linux/string_helpers.h uses strlen(), so include the correponding header.
Otherwise we get a compilation error if it's not also included by whoever
included the helper.

Link: https://lkml.kernel.org/r/20211005212634.3223113-1-lucas.demarchi@intel.com
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/string_helpers.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h
index 68189c4a2eb1..4ba39e1403b2 100644
--- a/include/linux/string_helpers.h
+++ b/include/linux/string_helpers.h
@@ -4,6 +4,7 @@
 
 #include <linux/bits.h>
 #include <linux/ctype.h>
+#include <linux/string.h>
 #include <linux/types.h>
 
 struct file;
-- 
cgit v1.2.3


From 1b1ad288b8f1b11f83396e537003722897ecc12b Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Mon, 8 Nov 2021 18:33:43 -0800
Subject: kallsyms: remove arch specific text and data check

Patch series "sections: Unify kernel sections range check and use", v4.

There are three head files(kallsyms.h, kernel.h and sections.h) which
include the kernel sections range check, let's make some cleanup and unify
them.

1. cleanup arch specific text/data check and fix address boundary check
   in kallsyms.h

2. make all the basic/core kernel range check function into sections.h

3. update all the callers, and use the helper in sections.h to simplify
   the code

After this series, we have 5 APIs about kernel sections range check in
sections.h

 * is_kernel_rodata()		--- already in sections.h
 * is_kernel_core_data()	--- come from core_kernel_data() in kernel.h
 * is_kernel_inittext()		--- come from kernel.h and kallsyms.h
 * __is_kernel_text()		--- add new internal helper
 * __is_kernel()		--- add new internal helper

Note: For the last two helpers, people should not use directly, consider to
      use corresponding function in kallsyms.h.

This patch (of 11):

Remove arch specific text and data check after commit 4ba66a976072 ("arch:
remove blackfin port"), no need arch-specific text/data check.

Link: https://lkml.kernel.org/r/20210930071143.63410-1-wangkefeng.wang@huawei.com
Link: https://lkml.kernel.org/r/20210930071143.63410-2-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reviewed-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kallsyms.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h
index a1d6fc82d7f0..bac340972670 100644
--- a/include/linux/kallsyms.h
+++ b/include/linux/kallsyms.h
@@ -34,8 +34,7 @@ static inline int is_kernel_inittext(unsigned long addr)
 
 static inline int is_kernel_text(unsigned long addr)
 {
-	if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) ||
-	    arch_is_kernel_text(addr))
+	if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext))
 		return 1;
 	return in_gate_area_no_mm(addr);
 }
-- 
cgit v1.2.3


From e7d5c4b0eb9be6ea5c2cfb510a9d6f56925118eb Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Mon, 8 Nov 2021 18:33:47 -0800
Subject: kallsyms: fix address-checks for kernel related range

The is_kernel_inittext/is_kernel_text/is_kernel function should not
include the end address(the labels _einittext, _etext and _end) when check
the address range, the issue exists since Linux v2.6.12.

Link: https://lkml.kernel.org/r/20210930071143.63410-3-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Acked-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kallsyms.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h
index bac340972670..62ce22b27ea8 100644
--- a/include/linux/kallsyms.h
+++ b/include/linux/kallsyms.h
@@ -27,21 +27,21 @@ struct module;
 static inline int is_kernel_inittext(unsigned long addr)
 {
 	if (addr >= (unsigned long)_sinittext
-	    && addr <= (unsigned long)_einittext)
+	    && addr < (unsigned long)_einittext)
 		return 1;
 	return 0;
 }
 
 static inline int is_kernel_text(unsigned long addr)
 {
-	if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext))
+	if ((addr >= (unsigned long)_stext && addr < (unsigned long)_etext))
 		return 1;
 	return in_gate_area_no_mm(addr);
 }
 
 static inline int is_kernel(unsigned long addr)
 {
-	if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end)
+	if (addr >= (unsigned long)_stext && addr < (unsigned long)_end)
 		return 1;
 	return in_gate_area_no_mm(addr);
 }
-- 
cgit v1.2.3


From a20deb3a348719adaf8c12e1bf4b599bfc51836e Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Mon, 8 Nov 2021 18:33:51 -0800
Subject: sections: move and rename core_kernel_data() to is_kernel_core_data()

Move core_kernel_data() into sections.h and rename it to
is_kernel_core_data(), also make it return bool value, then update all the
callers.

Link: https://lkml.kernel.org/r/20210930071143.63410-4-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reviewed-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Alexander Potapenko <glider@google.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kernel.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 46ca4404fb93..23f57a2d5a13 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -227,7 +227,6 @@ extern char *next_arg(char *args, char **param, char **val);
 
 extern int core_kernel_text(unsigned long addr);
 extern int init_kernel_text(unsigned long addr);
-extern int core_kernel_data(unsigned long addr);
 extern int __kernel_text_address(unsigned long addr);
 extern int kernel_text_address(unsigned long addr);
 extern int func_ptr_is_kernel_text(void *ptr);
-- 
cgit v1.2.3


From b9ad8fe7b8cab3814d1de41e8ddca602b2646f4b Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Mon, 8 Nov 2021 18:33:54 -0800
Subject: sections: move is_kernel_inittext() into sections.h

The is_kernel_inittext() and init_kernel_text() are with same
functionality, let's just keep is_kernel_inittext() and move it into
sections.h, then update all the callers.

Link: https://lkml.kernel.org/r/20210930071143.63410-5-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reviewed-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kallsyms.h | 8 --------
 include/linux/kernel.h   | 1 -
 2 files changed, 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h
index 62ce22b27ea8..bb5d9ec78e13 100644
--- a/include/linux/kallsyms.h
+++ b/include/linux/kallsyms.h
@@ -24,14 +24,6 @@
 struct cred;
 struct module;
 
-static inline int is_kernel_inittext(unsigned long addr)
-{
-	if (addr >= (unsigned long)_sinittext
-	    && addr < (unsigned long)_einittext)
-		return 1;
-	return 0;
-}
-
 static inline int is_kernel_text(unsigned long addr)
 {
 	if ((addr >= (unsigned long)_stext && addr < (unsigned long)_etext))
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 23f57a2d5a13..be84ab369650 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -226,7 +226,6 @@ extern bool parse_option_str(const char *str, const char *option);
 extern char *next_arg(char *args, char **param, char **val);
 
 extern int core_kernel_text(unsigned long addr);
-extern int init_kernel_text(unsigned long addr);
 extern int __kernel_text_address(unsigned long addr);
 extern int kernel_text_address(unsigned long addr);
 extern int func_ptr_is_kernel_text(void *ptr);
-- 
cgit v1.2.3


From 8f6e42e83362650007ed298070a69d084f2c8baf Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Mon, 8 Nov 2021 18:34:02 -0800
Subject: sections: provide internal __is_kernel() and __is_kernel_text()
 helper

An internal __is_kernel() helper which only check the kernel address
ranges, and an internal __is_kernel_text() helper which only check text
section ranges.

Link: https://lkml.kernel.org/r/20210930071143.63410-7-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reviewed-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Alexander Potapenko <glider@google.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kallsyms.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h
index bb5d9ec78e13..4176c7eca7b5 100644
--- a/include/linux/kallsyms.h
+++ b/include/linux/kallsyms.h
@@ -26,14 +26,14 @@ struct module;
 
 static inline int is_kernel_text(unsigned long addr)
 {
-	if ((addr >= (unsigned long)_stext && addr < (unsigned long)_etext))
+	if (__is_kernel_text(addr))
 		return 1;
 	return in_gate_area_no_mm(addr);
 }
 
 static inline int is_kernel(unsigned long addr)
 {
-	if (addr >= (unsigned long)_stext && addr < (unsigned long)_end)
+	if (__is_kernel(addr))
 		return 1;
 	return in_gate_area_no_mm(addr);
 }
-- 
cgit v1.2.3


From 5605f41917c6ad766814a8e417a4481e9157c991 Mon Sep 17 00:00:00 2001
From: Changcheng Deng <deng.changcheng@zte.com.cn>
Date: Mon, 8 Nov 2021 18:35:07 -0800
Subject: crash_dump: fix boolreturn.cocci warning

./include/linux/crash_dump.h: 119: 50-51: WARNING: return of 0/1 in
function 'is_kdump_kernel' with return type bool

Return statements in functions returning bool should use true/false
instead of 1/0.

Link: https://lkml.kernel.org/r/20211020083905.1037952-1-deng.changcheng@zte.com.cn
Signed-off-by: Changcheng Deng <deng.changcheng@zte.com.cn>
Reported-by: Zeal Robot <zealci@zte.com.cn>
Reviewed-by: Simon Horman <horms@kernel.org>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Ye Guojin <ye.guojin@zte.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/crash_dump.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h
index 0c547d866f1e..979c26176c1d 100644
--- a/include/linux/crash_dump.h
+++ b/include/linux/crash_dump.h
@@ -116,7 +116,7 @@ extern void register_vmcore_cb(struct vmcore_cb *cb);
 extern void unregister_vmcore_cb(struct vmcore_cb *cb);
 
 #else /* !CONFIG_CRASH_DUMP */
-static inline bool is_kdump_kernel(void) { return 0; }
+static inline bool is_kdump_kernel(void) { return false; }
 #endif /* CONFIG_CRASH_DUMP */
 
 /* Device Dump information to be filled by drivers */
-- 
cgit v1.2.3


From a10677a028b853c25054a4b8be04013ccb55682f Mon Sep 17 00:00:00 2001
From: Ye Guojin <ye.guojin@zte.com.cn>
Date: Mon, 8 Nov 2021 18:35:10 -0800
Subject: crash_dump: remove duplicate include in crash_dump.h

In crash_dump.h, header file <linux/pgtable.h> is included twice.  This
duplication was introduced in commit 65fddcfca8ad("mm: reorder includes
after introduction of linux/pgtable.h") where the order of the header
files is adjusted, while the old one was not removed.

Clean it up here.

Link: https://lkml.kernel.org/r/20211020090659.1038877-1-ye.guojin@zte.com.cn
Signed-off-by: Ye Guojin <ye.guojin@zte.com.cn>
Reported-by: Zeal Robot <zealci@zte.com.cn>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Changcheng Deng <deng.changcheng@zte.com.cn>
Cc: Simon Horman <horms@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/crash_dump.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h
index 979c26176c1d..620821549b23 100644
--- a/include/linux/crash_dump.h
+++ b/include/linux/crash_dump.h
@@ -8,8 +8,6 @@
 #include <linux/pgtable.h>
 #include <uapi/linux/vmcore.h>
 
-#include <linux/pgtable.h> /* for pgprot_t */
-
 /* For IS_ENABLED(CONFIG_CRASH_DUMP) */
 #define ELFCORE_ADDR_MAX	(-1ULL)
 #define ELFCORE_ADDR_ERR	(-2ULL)
-- 
cgit v1.2.3


From f26663684e76773ea86e2df13fb18f9d66c91151 Mon Sep 17 00:00:00 2001
From: Ye Guojin <ye.guojin@zte.com.cn>
Date: Mon, 8 Nov 2021 18:35:13 -0800
Subject: signal: remove duplicate include in signal.h

'linux/string.h' included in 'signal.h' is duplicated.
it's also included at line 7.

Link: https://lkml.kernel.org/r/20211019024934.973008-1-ye.guojin@zte.com.cn
Signed-off-by: Ye Guojin <ye.guojin@zte.com.cn>
Reported-by: Zeal Robot <zealci@zte.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/signal.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/signal.h b/include/linux/signal.h
index 3f96a6374e4f..37f5080c070a 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -126,7 +126,6 @@ static inline int sigequalsets(const sigset_t *set1, const sigset_t *set2)
 #define sigmask(sig)	(1UL << ((sig) - 1))
 
 #ifndef __HAVE_ARCH_SIG_SETOPS
-#include <linux/string.h>
 
 #define _SIG_SET_BINOP(name, op)					\
 static inline void name(sigset_t *r, const sigset_t *a, const sigset_t *b) \
-- 
cgit v1.2.3


From 372904c080be44629d84bb15ed5e12eed44b5f9f Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 8 Nov 2021 18:35:16 -0800
Subject: seq_file: move seq_escape() to a header

Move seq_escape() to the header as inliner, for a small kernel text size
reduction.

Link: https://lkml.kernel.org/r/20211001122917.67228-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/seq_file.h | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index dd99569595fd..103776e18555 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -4,6 +4,7 @@
 
 #include <linux/types.h>
 #include <linux/string.h>
+#include <linux/string_helpers.h>
 #include <linux/bug.h>
 #include <linux/mutex.h>
 #include <linux/cpumask.h>
@@ -135,7 +136,21 @@ static inline void seq_escape_str(struct seq_file *m, const char *src,
 	seq_escape_mem(m, src, strlen(src), flags, esc);
 }
 
-void seq_escape(struct seq_file *m, const char *s, const char *esc);
+/**
+ * seq_escape - print string into buffer, escaping some characters
+ * @m: target buffer
+ * @s: NULL-terminated string
+ * @esc: set of characters that need escaping
+ *
+ * Puts string into buffer, replacing each occurrence of character from
+ * @esc with usual octal escape.
+ *
+ * Use seq_has_overflowed() to check for errors.
+ */
+static inline void seq_escape(struct seq_file *m, const char *s, const char *esc)
+{
+	seq_escape_str(m, s, ESCAPE_OCTAL, esc);
+}
 
 void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
 		  int rowsize, int groupsize, const void *buf, size_t len,
-- 
cgit v1.2.3


From 10a6de19cad6efb9b49883513afb810dc265fca2 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Mon, 8 Nov 2021 18:35:19 -0800
Subject: seq_file: fix passing wrong private data

DEFINE_PROC_SHOW_ATTRIBUTE() is supposed to be used to define a series
of functions and variables to register proc file easily. And the users
can use proc_create_data() to pass their own private data and get it
via seq->private in the callback. Unfortunately, the proc file system
use PDE_DATA() to get private data instead of inode->i_private. So fix
it. Fortunately, there only one user of it which does not pass any
private data, so this bug does not break any in-tree codes.

Link: https://lkml.kernel.org/r/20211029032638.84884-1-songmuchun@bytedance.com
Fixes: 97a32539b956 ("proc: convert everything to "struct proc_ops"")
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Florent Revest <revest@chromium.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/seq_file.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index 103776e18555..72dbb44a4573 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -209,7 +209,7 @@ static const struct file_operations __name ## _fops = {			\
 #define DEFINE_PROC_SHOW_ATTRIBUTE(__name)				\
 static int __name ## _open(struct inode *inode, struct file *file)	\
 {									\
-	return single_open(file, __name ## _show, inode->i_private);	\
+	return single_open(file, __name ## _show, PDE_DATA(inode));	\
 }									\
 									\
 static const struct proc_ops __name ## _proc_ops = {			\
-- 
cgit v1.2.3


From 278167fd2f8ffe679351605fe03e29ff3ab8db18 Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@kernel.org>
Date: Tue, 9 Nov 2021 16:29:49 -0800
Subject: block: add __must_check for *add_disk*() callers

Now that we have done a spring cleaning on all drivers and added
error checking / handling, let's keep it that way and ensure
no new drivers fail to stick with it.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20211110002949.999380-1-mcgrof@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/genhd.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 462634b4b48f..74c410263113 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -205,9 +205,9 @@ static inline dev_t disk_devt(struct gendisk *disk)
 void disk_uevent(struct gendisk *disk, enum kobject_action action);
 
 /* block/genhd.c */
-int device_add_disk(struct device *parent, struct gendisk *disk,
-		const struct attribute_group **groups);
-static inline int add_disk(struct gendisk *disk)
+int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
+				 const struct attribute_group **groups);
+static inline int __must_check add_disk(struct gendisk *disk)
 {
 	return device_add_disk(NULL, disk, NULL);
 }
-- 
cgit v1.2.3


From 64355db3caf6468dc711995239efe0cbcd7d0091 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= <linux@weissschuh.net>
Date: Wed, 10 Nov 2021 13:16:55 +0100
Subject: mod_devicetable: fix kdocs for ishtp_device_id
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The kdocs were copied from another device_id struct and not adapted.

Fixes: fa443bc3c1e4 ("HID: intel-ish-hid: add support for MODULE_DEVICE_TABLE()")
Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 include/linux/mod_devicetable.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index befbf53c4b7c..c70abe7aaef2 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -901,8 +901,7 @@ struct dfl_device_id {
 
 /**
  * struct ishtp_device_id - ISHTP device identifier
- * @guid_string: 36 char string of the form fa50ff2b-f2e8-45de-83fa-65417f2f49ba
- * @context: pointer to driver specific data
+ * @guid: GUID of the device.
  */
 struct ishtp_device_id {
 	guid_t guid;
-- 
cgit v1.2.3


From 5d5e4522a7f404d1a96fd6c703989d32a9c9568d Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Sun, 7 Nov 2021 14:51:16 +1000
Subject: printk: restore flushing of NMI buffers on remote CPUs after NMI
 backtraces

printk from NMI context relies on irq work being raised on the local CPU
to print to console. This can be a problem if the NMI was raised by a
lockup detector to print lockup stack and regs, because the CPU may not
enable irqs (because it is locked up).

Introduce printk_trigger_flush() that can be called another CPU to try
to get those messages to the console, call that where printk_safe_flush
was previously called.

Fixes: 93d102f094be ("printk: remove safe buffers")
Cc: stable@vger.kernel.org # 5.15
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Reviewed-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20211107045116.1754411-1-npiggin@gmail.com
---
 include/linux/printk.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/printk.h b/include/linux/printk.h
index a1379df43251..596ad6fa0336 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -206,6 +206,7 @@ void dump_stack_print_info(const char *log_lvl);
 void show_regs_print_info(const char *log_lvl);
 extern asmlinkage void dump_stack_lvl(const char *log_lvl) __cold;
 extern asmlinkage void dump_stack(void) __cold;
+void printk_trigger_flush(void);
 #else
 static inline __printf(1, 0)
 int vprintk(const char *s, va_list args)
@@ -282,6 +283,9 @@ static inline void dump_stack_lvl(const char *log_lvl)
 static inline void dump_stack(void)
 {
 }
+static inline void printk_trigger_flush(void)
+{
+}
 #endif
 
 #ifdef CONFIG_SMP
-- 
cgit v1.2.3


From a19672f6b9718df247a9087a266150dbe833022e Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 12 Aug 2021 21:54:58 +0100
Subject: folio: Add a function to change the private data attached to a folio

Add a function, folio_change_private(), that will change the private data
attached to a folio, without the need to twiddle the private bit or the
refcount.  It assumes that folio_add_private() has already been called on
the page.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Tested-by: Jeff Layton <jlayton@kernel.org>
Tested-by: Dominique Martinet <asmadeus@codewreck.org>
Tested-by: kafs-testing@auristor.com
Link: https://lore.kernel.org/r/162981149911.1901565.17776700811659843340.stgit@warthog.procyon.org.uk/
Link: https://lore.kernel.org/r/163005743485.2472992.5100702469503007023.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/163584180781.4023316.5037526301198034310.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/163649324326.309189.17817587229450840783.stgit@warthog.procyon.org.uk/ # v4
Link: https://lore.kernel.org/r/163657848531.834781.14269656212269187893.stgit@warthog.procyon.org.uk/ # v5
---
 include/linux/pagemap.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 6a30916b76e5..1f560aecd9b5 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -279,6 +279,25 @@ static inline void folio_attach_private(struct folio *folio, void *data)
 	folio_set_private(folio);
 }
 
+/**
+ * folio_change_private - Change private data on a folio.
+ * @folio: Folio to change the data on.
+ * @data: Data to set on the folio.
+ *
+ * Change the private data attached to a folio and return the old
+ * data.  The page must previously have had data attached and the data
+ * must be detached before the folio will be freed.
+ *
+ * Return: Data that was previously attached to the folio.
+ */
+static inline void *folio_change_private(struct folio *folio, void *data)
+{
+	void *old = folio_get_private(folio);
+
+	folio->private = data;
+	return old;
+}
+
 /**
  * folio_detach_private - Detach private data from a folio.
  * @folio: Folio to detach data from.
-- 
cgit v1.2.3


From 452c472e26348df1e7052544130aa98eebbd2331 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 12 Aug 2021 22:09:57 +0100
Subject: folio: Add a function to get the host inode for a folio

Add a convenience function, folio_inode() that will get the host inode from
a folio's mapping.

Changes:
 ver #3:
  - Fix mistake in function description[2].
 ver #2:
  - Fix contradiction between doc and implementation by disallowing use
    with swap caches[1].

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Tested-by: Jeff Layton <jlayton@kernel.org>
Tested-by: Dominique Martinet <asmadeus@codewreck.org>
Tested-by: kafs-testing@auristor.com
Link: https://lore.kernel.org/r/YST8OcVNy02Rivbm@casper.infradead.org/ [1]
Link: https://lore.kernel.org/r/YYKLkBwQdtn4ja+i@casper.infradead.org/ [2]
Link: https://lore.kernel.org/r/162880453171.3369675.3704943108660112470.stgit@warthog.procyon.org.uk/ # rfc
Link: https://lore.kernel.org/r/162981151155.1901565.7010079316994382707.stgit@warthog.procyon.org.uk/
Link: https://lore.kernel.org/r/163005744370.2472992.18324470937328925723.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/163584184628.4023316.9386282630968981869.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/163649325519.309189.15072332908703129455.stgit@warthog.procyon.org.uk/ # v4
Link: https://lore.kernel.org/r/163657850401.834781.1031963517399283294.stgit@warthog.procyon.org.uk/ # v5
---
 include/linux/pagemap.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 1f560aecd9b5..1a0c646eb6ff 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -253,6 +253,20 @@ static inline struct address_space *page_mapping_file(struct page *page)
 	return folio_mapping(folio);
 }
 
+/**
+ * folio_inode - Get the host inode for this folio.
+ * @folio: The folio.
+ *
+ * For folios which are in the page cache, return the inode that this folio
+ * belongs to.
+ *
+ * Do not call this for folios which aren't in the page cache.
+ */
+static inline struct inode *folio_inode(struct folio *folio)
+{
+	return folio->mapping->host;
+}
+
 static inline bool page_cache_add_speculative(struct page *page, int count)
 {
 	VM_BUG_ON_PAGE(PageTail(page), page);
-- 
cgit v1.2.3


From 78525c74d9e7d1a6ce69bd4388f045f6e474a20b Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 11 Aug 2021 09:49:13 +0100
Subject: netfs, 9p, afs, ceph: Use folios

Convert the netfs helper library to use folios throughout, convert the 9p
and afs filesystems to use folios in their file I/O paths and convert the
ceph filesystem to use just enough folios to compile.

With these changes, afs passes -g quick xfstests.

Changes
=======
ver #5:
 - Got rid of folio_end{io,_read,_write}() and inlined the stuff it does
   instead (Willy decided he didn't want this after all).

ver #4:
 - Fixed a bug in afs_redirty_page() whereby it didn't set the next page
   index in the loop and returned too early.
 - Simplified a check in v9fs_vfs_write_folio_locked()[1].
 - Undid a change to afs_symlink_readpage()[1].
 - Used offset_in_folio() in afs_write_end()[1].
 - Changed from using page_endio() to folio_end{io,_read,_write}()[1].

ver #2:
 - Add 9p foliation.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Tested-by: Jeff Layton <jlayton@kernel.org>
Tested-by: Dominique Martinet <asmadeus@codewreck.org>
Tested-by: kafs-testing@auristor.com
cc: Matthew Wilcox (Oracle) <willy@infradead.org>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: Ilya Dryomov <idryomov@gmail.com>
cc: Dominique Martinet <asmadeus@codewreck.org>
cc: v9fs-developer@lists.sourceforge.net
cc: linux-afs@lists.infradead.org
cc: ceph-devel@vger.kernel.org
cc: linux-cachefs@redhat.com
Link: https://lore.kernel.org/r/YYKa3bfQZxK5/wDN@casper.infradead.org/ [1]
Link: https://lore.kernel.org/r/2408234.1628687271@warthog.procyon.org.uk/ # rfc
Link: https://lore.kernel.org/r/162877311459.3085614.10601478228012245108.stgit@warthog.procyon.org.uk/
Link: https://lore.kernel.org/r/162981153551.1901565.3124454657133703341.stgit@warthog.procyon.org.uk/
Link: https://lore.kernel.org/r/163005745264.2472992.9852048135392188995.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/163584187452.4023316.500389675405550116.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/163649328026.309189.1124218109373941936.stgit@warthog.procyon.org.uk/ # v4
Link: https://lore.kernel.org/r/163657852454.834781.9265101983152100556.stgit@warthog.procyon.org.uk/ # v5
---
 include/linux/netfs.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 12c4177f7703..ca0683b9e3d1 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -166,13 +166,13 @@ struct netfs_read_request {
 	short			error;		/* 0 or error that occurred */
 	loff_t			i_size;		/* Size of the file */
 	loff_t			start;		/* Start position */
-	pgoff_t			no_unlock_page;	/* Don't unlock this page after read */
+	pgoff_t			no_unlock_folio; /* Don't unlock this folio after read */
 	refcount_t		usage;
 	unsigned long		flags;
 #define NETFS_RREQ_INCOMPLETE_IO	0	/* Some ioreqs terminated short or with error */
 #define NETFS_RREQ_WRITE_TO_CACHE	1	/* Need to write to the cache */
-#define NETFS_RREQ_NO_UNLOCK_PAGE	2	/* Don't unlock no_unlock_page on completion */
-#define NETFS_RREQ_DONT_UNLOCK_PAGES	3	/* Don't unlock the pages on completion */
+#define NETFS_RREQ_NO_UNLOCK_FOLIO	2	/* Don't unlock no_unlock_folio on completion */
+#define NETFS_RREQ_DONT_UNLOCK_FOLIOS	3	/* Don't unlock the folios on completion */
 #define NETFS_RREQ_FAILED		4	/* The request failed */
 #define NETFS_RREQ_IN_PROGRESS		5	/* Unlocked when the request completes */
 	const struct netfs_read_request_ops *netfs_ops;
@@ -190,7 +190,7 @@ struct netfs_read_request_ops {
 	void (*issue_op)(struct netfs_read_subrequest *subreq);
 	bool (*is_still_valid)(struct netfs_read_request *rreq);
 	int (*check_write_begin)(struct file *file, loff_t pos, unsigned len,
-				 struct page *page, void **_fsdata);
+				 struct folio *folio, void **_fsdata);
 	void (*done)(struct netfs_read_request *rreq);
 	void (*cleanup)(struct address_space *mapping, void *netfs_priv);
 };
@@ -240,11 +240,11 @@ extern void netfs_readahead(struct readahead_control *,
 			    const struct netfs_read_request_ops *,
 			    void *);
 extern int netfs_readpage(struct file *,
-			  struct page *,
+			  struct folio *,
 			  const struct netfs_read_request_ops *,
 			  void *);
 extern int netfs_write_begin(struct file *, struct address_space *,
-			     loff_t, unsigned int, unsigned int, struct page **,
+			     loff_t, unsigned int, unsigned int, struct folio **,
 			     void **,
 			     const struct netfs_read_request_ops *,
 			     void *);
-- 
cgit v1.2.3


From f89315650ba34ec6c91a8bded72796980bee2a4d Mon Sep 17 00:00:00 2001
From: Mark Pashmfouroush <markpash@cloudflare.com>
Date: Wed, 10 Nov 2021 11:10:15 +0000
Subject: bpf: Add ingress_ifindex to bpf_sk_lookup

It may be helpful to have access to the ifindex during bpf socket
lookup. An example may be to scope certain socket lookup logic to
specific interfaces, i.e. an interface may be made exempt from custom
lookup code.

Add the ifindex of the arriving connection to the bpf_sk_lookup API.

Signed-off-by: Mark Pashmfouroush <markpash@cloudflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211110111016.5670-2-markpash@cloudflare.com
---
 include/linux/filter.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 24b7ed2677af..b6a216eb217a 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1374,6 +1374,7 @@ struct bpf_sk_lookup_kern {
 		const struct in6_addr *daddr;
 	} v6;
 	struct sock	*selected_sk;
+	u32		ingress_ifindex;
 	bool		no_reuseport;
 };
 
@@ -1436,7 +1437,7 @@ extern struct static_key_false bpf_sk_lookup_enabled;
 static inline bool bpf_sk_lookup_run_v4(struct net *net, int protocol,
 					const __be32 saddr, const __be16 sport,
 					const __be32 daddr, const u16 dport,
-					struct sock **psk)
+					const int ifindex, struct sock **psk)
 {
 	struct bpf_prog_array *run_array;
 	struct sock *selected_sk = NULL;
@@ -1452,6 +1453,7 @@ static inline bool bpf_sk_lookup_run_v4(struct net *net, int protocol,
 			.v4.daddr	= daddr,
 			.sport		= sport,
 			.dport		= dport,
+			.ingress_ifindex	= ifindex,
 		};
 		u32 act;
 
@@ -1474,7 +1476,7 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol,
 					const __be16 sport,
 					const struct in6_addr *daddr,
 					const u16 dport,
-					struct sock **psk)
+					const int ifindex, struct sock **psk)
 {
 	struct bpf_prog_array *run_array;
 	struct sock *selected_sk = NULL;
@@ -1490,6 +1492,7 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol,
 			.v6.daddr	= daddr,
 			.sport		= sport,
 			.dport		= dport,
+			.ingress_ifindex	= ifindex,
 		};
 		u32 act;
 
-- 
cgit v1.2.3


From 68dbbe7d5b4fde736d104cbbc9a2fce875562012 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Date: Thu, 4 Nov 2021 17:31:58 +0900
Subject: libata: fix read log timeout value

Some ATA drives are very slow to respond to READ_LOG_EXT and
READ_LOG_DMA_EXT commands issued from ata_dev_configure() when the
device is revalidated right after resuming a system or inserting the
ATA adapter driver (e.g. ahci). The default 5s timeout
(ATA_EH_CMD_DFL_TIMEOUT) used for these commands is too short, causing
errors during the device configuration. Ex:

...
ata9: SATA max UDMA/133 abar m524288@0x9d200000 port 0x9d200400 irq 209
ata9: SATA link up 6.0 Gbps (SStatus 133 SControl 300)
ata9.00: ATA-9: XXX  XXXXXXXXXXXXXXX, XXXXXXXX, max UDMA/133
ata9.00: qc timeout (cmd 0x2f)
ata9.00: Read log page 0x00 failed, Emask 0x4
ata9.00: Read log page 0x00 failed, Emask 0x40
ata9.00: NCQ Send/Recv Log not supported
ata9.00: Read log page 0x08 failed, Emask 0x40
ata9.00: 27344764928 sectors, multi 16: LBA48 NCQ (depth 32), AA
ata9.00: Read log page 0x00 failed, Emask 0x40
ata9.00: ATA Identify Device Log not supported
ata9.00: failed to set xfermode (err_mask=0x40)
ata9: SATA link up 6.0 Gbps (SStatus 133 SControl 300)
ata9.00: configured for UDMA/133
...

The timeout error causes a soft reset of the drive link, followed in
most cases by a successful revalidation as that give enough time to the
drive to become fully ready to quickly process the read log commands.
However, in some cases, this also fails resulting in the device being
dropped.

Fix this by using adding the ata_eh_revalidate_timeouts entries for the
READ_LOG_EXT and READ_LOG_DMA_EXT commands. This defines a timeout
increased to 15s, retriable one time.

Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
Tested-by: Geert Uytterhoeven <geert+renesas@glider.be>
Cc: stable@vger.kernel.org
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
---
 include/linux/libata.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 2884383f1718..5331557316e8 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -394,7 +394,7 @@ enum {
 	/* This should match the actual table size of
 	 * ata_eh_cmd_timeout_table in libata-eh.c.
 	 */
-	ATA_EH_CMD_TIMEOUT_TABLE_SIZE = 6,
+	ATA_EH_CMD_TIMEOUT_TABLE_SIZE = 7,
 
 	/* Horkage types. May be set by libata or controller on drives
 	   (some horkage may be drive/controller pair dependent */
-- 
cgit v1.2.3


From a25efb3863d068929f0bbeb87a995df11507e691 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
Date: Thu, 23 Sep 2021 13:57:42 +0200
Subject: dma-buf: add dma_fence_describe and dma_resv_describe v2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add functions to dump dma_fence and dma_resv objects into a seq_file and
use them for printing the debugfs information.

v2: fix missing include reported by test robot.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Rob Clark <robdclark@gmail.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20211103081231.18578-2-christian.koenig@amd.com
---
 include/linux/dma-fence.h | 1 +
 include/linux/dma-resv.h  | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h
index a706b7bf51d7..1ea691753bd3 100644
--- a/include/linux/dma-fence.h
+++ b/include/linux/dma-fence.h
@@ -264,6 +264,7 @@ void dma_fence_init(struct dma_fence *fence, const struct dma_fence_ops *ops,
 
 void dma_fence_release(struct kref *kref);
 void dma_fence_free(struct dma_fence *fence);
+void dma_fence_describe(struct dma_fence *fence, struct seq_file *seq);
 
 /**
  * dma_fence_put - decreases refcount of the fence
diff --git a/include/linux/dma-resv.h b/include/linux/dma-resv.h
index dbd235ab447f..09c6063b199a 100644
--- a/include/linux/dma-resv.h
+++ b/include/linux/dma-resv.h
@@ -490,5 +490,6 @@ int dma_resv_copy_fences(struct dma_resv *dst, struct dma_resv *src);
 long dma_resv_wait_timeout(struct dma_resv *obj, bool wait_all, bool intr,
 			   unsigned long timeout);
 bool dma_resv_test_signaled(struct dma_resv *obj, bool test_all);
+void dma_resv_describe(struct dma_resv *obj, struct seq_file *seq);
 
 #endif /* _LINUX_RESERVATION_H */
-- 
cgit v1.2.3


From 9c8e9c9681a0f3f1ae90a90230d059c7a1dece5a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 4 Nov 2021 00:27:29 +0100
Subject: PCI/MSI: Move non-mask check back into low level accessors

The recent rework of PCI/MSI[X] masking moved the non-mask checks from the
low level accessors into the higher level mask/unmask functions.

This missed the fact that these accessors can be invoked from other places
as well. The missing checks break XEN-PV which sets pci_msi_ignore_mask and
also violates the virtual MSIX and the msi_attrib.maskbit protections.

Instead of sprinkling checks all over the place, lift them back into the
low level accessor functions. To avoid checking three different conditions
combine them into one property of msi_desc::msi_attrib.

[ josef: Fixed the missed conversion in the core code ]

Fixes: fcacdfbef5a1 ("PCI/MSI: Provide a new set of mask and unmask functions")
Reported-by: Josef Johansson <josef@oderland.se>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Josef Johansson <josef@oderland.se>
Cc: Bjorn Helgaas <helgaas@kernel.org>
Cc: stable@vger.kernel.org
---
 include/linux/msi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index 49cf6eb222e7..e616f94c7c58 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -148,7 +148,7 @@ struct msi_desc {
 				u8	is_msix		: 1;
 				u8	multiple	: 3;
 				u8	multi_cap	: 3;
-				u8	maskbit		: 1;
+				u8	can_mask	: 1;
 				u8	is_64		: 1;
 				u8	is_virtual	: 1;
 				u16	entry_nr;
-- 
cgit v1.2.3


From 2226667a145db2e1f314d7f57fd644fe69863ab9 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Thu, 4 Nov 2021 18:01:29 +0000
Subject: PCI/MSI: Deal with devices lying about their MSI mask capability

It appears that some devices are lying about their mask capability,
pretending that they don't have it, while they actually do.
The net result is that now that we don't enable MSIs on such
endpoint.

Add a new per-device flag to deal with this. Further patches will
make use of it, sadly.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20211104180130.3825416-2-maz@kernel.org
Cc: Bjorn Helgaas <helgaas@kernel.org>
---
 include/linux/pci.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index c8afbee5da4b..d0dba7fd9848 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -233,6 +233,8 @@ enum pci_dev_flags {
 	PCI_DEV_FLAGS_NO_FLR_RESET = (__force pci_dev_flags_t) (1 << 10),
 	/* Don't use Relaxed Ordering for TLPs directed at this device */
 	PCI_DEV_FLAGS_NO_RELAXED_ORDERING = (__force pci_dev_flags_t) (1 << 11),
+	/* Device does honor MSI masking despite saying otherwise */
+	PCI_DEV_FLAGS_HAS_MSI_MASKING = (__force pci_dev_flags_t) (1 << 12),
 };
 
 enum pci_irq_reroute_variant {
-- 
cgit v1.2.3


From a8b76910e465d718effce0cad306a21fa4f3526b Mon Sep 17 00:00:00 2001
From: Valentin Schneider <valentin.schneider@arm.com>
Date: Wed, 10 Nov 2021 20:24:44 +0000
Subject: preempt: Restore preemption model selection configs

Commit c597bfddc9e9 ("sched: Provide Kconfig support for default dynamic
preempt mode") changed the selectable config names for the preemption
model. This means a config file must now select

  CONFIG_PREEMPT_BEHAVIOUR=y

rather than

  CONFIG_PREEMPT=y

to get a preemptible kernel. This means all arch config files would need to
be updated - right now they'll all end up with the default
CONFIG_PREEMPT_NONE_BEHAVIOUR.

Rather than touch a good hundred of config files, restore usage of
CONFIG_PREEMPT{_NONE, _VOLUNTARY}. Make them configure:
o The build-time preemption model when !PREEMPT_DYNAMIC
o The default boot-time preemption model when PREEMPT_DYNAMIC

Add siblings of those configs with the _BUILD suffix to unconditionally
designate the build-time preemption model (PREEMPT_DYNAMIC is built with
the "highest" preemption model it supports, aka PREEMPT). Downstream
configs should by now all be depending / selected by CONFIG_PREEMPTION
rather than CONFIG_PREEMPT, so only a few sites need patching up.

Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Marco Elver <elver@google.com>
Link: https://lore.kernel.org/r/20211110202448.4054153-2-valentin.schneider@arm.com
---
 include/linux/kernel.h   | 2 +-
 include/linux/vermagic.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 968b4c4fe65b..77755ac3e189 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -85,7 +85,7 @@
 struct completion;
 struct user;
 
-#ifdef CONFIG_PREEMPT_VOLUNTARY
+#ifdef CONFIG_PREEMPT_VOLUNTARY_BUILD
 
 extern int __cond_resched(void);
 # define might_resched() __cond_resched()
diff --git a/include/linux/vermagic.h b/include/linux/vermagic.h
index 1eaaa93c37bf..329d63babaeb 100644
--- a/include/linux/vermagic.h
+++ b/include/linux/vermagic.h
@@ -15,7 +15,7 @@
 #else
 #define MODULE_VERMAGIC_SMP ""
 #endif
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPT_BUILD
 #define MODULE_VERMAGIC_PREEMPT "preempt "
 #elif defined(CONFIG_PREEMPT_RT)
 #define MODULE_VERMAGIC_PREEMPT "preempt_rt "
-- 
cgit v1.2.3


From 2f70ddb1f71814aae525c58086fcb2f6974e6591 Mon Sep 17 00:00:00 2001
From: Ashish Kalra <ashish.kalra@amd.com>
Date: Tue, 24 Aug 2021 11:06:40 +0000
Subject: EFI: Introduce the new AMD Memory Encryption GUID.

Introduce a new AMD Memory Encryption GUID which is currently
used for defining a new UEFI environment variable which indicates
UEFI/OVMF support for the SEV live migration feature. This variable
is setup when UEFI/OVMF detects host/hypervisor support for SEV
live migration and later this variable is read by the kernel using
EFI runtime services to verify if OVMF supports the live migration
feature.

Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Message-Id: <1cea22976d2208f34d47e0c1ce0ecac816c13111.1629726117.git.ashish.kalra@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/efi.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/efi.h b/include/linux/efi.h
index 6b5d36babfcc..dbd39b20e034 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -362,6 +362,7 @@ void efi_native_runtime_setup(void);
 
 /* OEM GUIDs */
 #define DELLEMC_EFI_RCI2_TABLE_GUID		EFI_GUID(0x2d9f28a2, 0xa886, 0x456a,  0x97, 0xa8, 0xf1, 0x1e, 0xf2, 0x4f, 0xf4, 0x55)
+#define AMD_SEV_MEM_ENCRYPT_GUID		EFI_GUID(0x0cf29b71, 0x9e51, 0x433a,  0xa3, 0xb7, 0x81, 0xf3, 0xab, 0x16, 0xb8, 0x75)
 
 typedef struct {
 	efi_guid_t guid;
-- 
cgit v1.2.3


From f4d316537059b274452727e86f46ff3bdefdde4d Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 11 Nov 2021 10:13:38 -0500
Subject: KVM: generalize "bugged" VM to "dead" VM

Generalize KVM_REQ_VM_BUGGED so that it can be called even in cases
where it is by design that the VM cannot be operated upon.  In this
case any KVM_BUG_ON should still warn, so introduce a new flag
kvm->vm_dead that is separate from kvm->vm_bugged.

Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 60a35d9fe259..9e0667e3723e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -150,7 +150,7 @@ static inline bool is_error_page(struct page *page)
 #define KVM_REQ_MMU_RELOAD        (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
 #define KVM_REQ_UNBLOCK           2
 #define KVM_REQ_UNHALT            3
-#define KVM_REQ_VM_BUGGED         (4 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_VM_DEAD           (4 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
 #define KVM_REQUEST_ARCH_BASE     8
 
 #define KVM_ARCH_REQ_FLAGS(nr, flags) ({ \
@@ -617,6 +617,7 @@ struct kvm {
 	unsigned int max_halt_poll_ns;
 	u32 dirty_ring_size;
 	bool vm_bugged;
+	bool vm_dead;
 
 #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
 	struct notifier_block pm_notifier;
@@ -650,12 +651,19 @@ struct kvm {
 #define vcpu_err(vcpu, fmt, ...)					\
 	kvm_err("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__)
 
+static inline void kvm_vm_dead(struct kvm *kvm)
+{
+	kvm->vm_dead = true;
+	kvm_make_all_cpus_request(kvm, KVM_REQ_VM_DEAD);
+}
+
 static inline void kvm_vm_bugged(struct kvm *kvm)
 {
 	kvm->vm_bugged = true;
-	kvm_make_all_cpus_request(kvm, KVM_REQ_VM_BUGGED);
+	kvm_vm_dead(kvm);
 }
 
+
 #define KVM_BUG(cond, kvm, fmt...)				\
 ({								\
 	int __ret = (cond);					\
-- 
cgit v1.2.3


From 0093de693fe7baf31641d1863793e6db93a43b6e Mon Sep 17 00:00:00 2001
From: Yixuan Cao <caoyixuan2019@email.szu.edu.cn>
Date: Wed, 10 Nov 2021 20:32:30 -0800
Subject: mm/page_owner.c: modify the type of argument "order" in some
 functions

The type of "order" in struct page_owner is unsigned short.
However, it is unsigned int in the following 3 functions:

  __reset_page_owner
  __set_page_owner_handle
  __set_page_owner_handle

The type of "order" in argument list is unsigned int, which is
inconsistent.

[akpm@linux-foundation.org: update include/linux/page_owner.h]

Link: https://lkml.kernel.org/r/20211020125945.47792-1-caoyixuan2019@email.szu.edu.cn
Signed-off-by: Yixuan Cao <caoyixuan2019@email.szu.edu.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page_owner.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h
index 43c638c51c1f..119a0c9d2a8b 100644
--- a/include/linux/page_owner.h
+++ b/include/linux/page_owner.h
@@ -8,9 +8,9 @@
 extern struct static_key_false page_owner_inited;
 extern struct page_ext_operations page_owner_ops;
 
-extern void __reset_page_owner(struct page *page, unsigned int order);
+extern void __reset_page_owner(struct page *page, unsigned short order);
 extern void __set_page_owner(struct page *page,
-			unsigned int order, gfp_t gfp_mask);
+			unsigned short order, gfp_t gfp_mask);
 extern void __split_page_owner(struct page *page, unsigned int nr);
 extern void __folio_copy_owner(struct folio *newfolio, struct folio *old);
 extern void __set_page_owner_migrate_reason(struct page *page, int reason);
@@ -18,14 +18,14 @@ extern void __dump_page_owner(const struct page *page);
 extern void pagetypeinfo_showmixedcount_print(struct seq_file *m,
 					pg_data_t *pgdat, struct zone *zone);
 
-static inline void reset_page_owner(struct page *page, unsigned int order)
+static inline void reset_page_owner(struct page *page, unsigned short order)
 {
 	if (static_branch_unlikely(&page_owner_inited))
 		__reset_page_owner(page, order);
 }
 
 static inline void set_page_owner(struct page *page,
-			unsigned int order, gfp_t gfp_mask)
+			unsigned short order, gfp_t gfp_mask)
 {
 	if (static_branch_unlikely(&page_owner_inited))
 		__set_page_owner(page, order, gfp_mask);
@@ -52,7 +52,7 @@ static inline void dump_page_owner(const struct page *page)
 		__dump_page_owner(page);
 }
 #else
-static inline void reset_page_owner(struct page *page, unsigned int order)
+static inline void reset_page_owner(struct page *page, unsigned short order)
 {
 }
 static inline void set_page_owner(struct page *page,
@@ -60,7 +60,7 @@ static inline void set_page_owner(struct page *page,
 {
 }
 static inline void split_page_owner(struct page *page,
-			unsigned int order)
+			unsigned short order)
 {
 }
 static inline void folio_copy_owner(struct folio *newfolio, struct folio *folio)
-- 
cgit v1.2.3


From ab09243aa95a72bac5c71e852773de34116f8d0f Mon Sep 17 00:00:00 2001
From: Alistair Popple <apopple@nvidia.com>
Date: Wed, 10 Nov 2021 20:32:40 -0800
Subject: mm/migrate.c: remove MIGRATE_PFN_LOCKED

MIGRATE_PFN_LOCKED is used to indicate to migrate_vma_prepare() that a
source page was already locked during migrate_vma_collect().  If it
wasn't then the a second attempt is made to lock the page.  However if
the first attempt failed it's unlikely a second attempt will succeed,
and the retry adds complexity.  So clean this up by removing the retry
and MIGRATE_PFN_LOCKED flag.

Destination pages are also meant to have the MIGRATE_PFN_LOCKED flag
set, but nothing actually checks that.

Link: https://lkml.kernel.org/r/20211025041608.289017-1-apopple@nvidia.com
Signed-off-by: Alistair Popple <apopple@nvidia.com>
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
Acked-by: Felix Kuehling <Felix.Kuehling@amd.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Ben Skeggs <bskeggs@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/migrate.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index eeb818c4fc78..4850cc5bf813 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -110,7 +110,6 @@ static inline int migrate_misplaced_page(struct page *page,
  */
 #define MIGRATE_PFN_VALID	(1UL << 0)
 #define MIGRATE_PFN_MIGRATE	(1UL << 1)
-#define MIGRATE_PFN_LOCKED	(1UL << 2)
 #define MIGRATE_PFN_WRITE	(1UL << 3)
 #define MIGRATE_PFN_SHIFT	6
 
-- 
cgit v1.2.3


From 68da4e0eaaab421f228eac57cbe7505b136464af Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Wed, 10 Nov 2021 12:01:14 -0600
Subject: Revert "PCI: Remove struct pci_dev->driver"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit b5f9c644eb1baafcd349ad134e2110773f8d0a38.

Revert b5f9c644eb1b ("PCI: Remove struct pci_dev->driver"), which is needed
to revert 2a4d9408c9e8 ("PCI: Use to_pci_driver() instead of
pci_dev->driver").

2a4d9408c9e8 caused a NULL pointer dereference reported by Robert Święcki.
Details in the revert of that commit.

Fixes: 2a4d9408c9e8 ("PCI: Use to_pci_driver() instead of pci_dev->driver")
Link: https://lore.kernel.org/linux-i2c/CAP145pgdrdiMAT7=-iB1DMgA7t_bMqTcJL4N0=6u8kNY3EU0dw@mail.gmail.com/
Reported-by: Robert Święcki <robert@swiecki.net>
Tested-by: Robert Święcki <robert@swiecki.net>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 include/linux/pci.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index b4dbcc86b3f1..e58888e21ab7 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -342,6 +342,7 @@ struct pci_dev {
 	u16		pcie_flags_reg;	/* Cached PCIe Capabilities Register */
 	unsigned long	*dma_alias_mask;/* Mask of enabled devfn aliases */
 
+	struct pci_driver *driver;	/* Driver bound to this device */
 	u64		dma_mask;	/* Mask of the bits of bus address this
 					   device implements.  Normally this is
 					   0xffffffff.  You only need to change
-- 
cgit v1.2.3


From bf9167a8b40c9cf463521da05342db81808c1b6e Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 11 Nov 2021 09:56:33 +0100
Subject: HID: intel-ish-hid: fix module device-id handling

A late addititon to the intel-ish-hid framework caused a build failure
with clang, and introduced an ABI to the module loader that stops working
if any driver ever needs to bind to more than one UUID:

drivers/hid/intel-ish-hid/ishtp-fw-loader.c:1067:4: error: initializer element is not a compile-time constant

Change the ishtp_device_id to have correct documentation and a driver_data
field like all the other ones, and change the drivers to use the ID table
as the primary identification in a way that works with all compilers
and avoids duplciating the identifiers.

Fixes: f155dfeaa4ee ("platform/x86: isthp_eclite: only load for matching devices")
Fixes: facfe0a4fdce ("platform/chrome: chros_ec_ishtp: only load for matching devices")
Fixes: 0d0cccc0fd83 ("HID: intel-ish-hid: hid-client: only load for matching devices")
Fixes: 44e2a58cb880 ("HID: intel-ish-hid: fw-loader: only load for matching devices")
Fixes: cb1a2c6847f7 ("HID: intel-ish-hid: use constants for modaliases")
Fixes: fa443bc3c1e4 ("HID: intel-ish-hid: add support for MODULE_DEVICE_TABLE()")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
[jkosina@suse.cz: fix ecl_ishtp_cl_driver.id initialization]
[jkosina@suse.cz: fix conflict with already fixed kerneldoc]
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 include/linux/intel-ish-client-if.h | 4 ++--
 include/linux/mod_devicetable.h     | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/intel-ish-client-if.h b/include/linux/intel-ish-client-if.h
index aee8ff4739b1..f45f13304add 100644
--- a/include/linux/intel-ish-client-if.h
+++ b/include/linux/intel-ish-client-if.h
@@ -9,7 +9,7 @@
 #define _INTEL_ISH_CLIENT_IF_H_
 
 #include <linux/device.h>
-#include <linux/uuid.h>
+#include <linux/mod_devicetable.h>
 
 struct ishtp_cl_device;
 struct ishtp_device;
@@ -40,7 +40,7 @@ enum cl_state {
 struct ishtp_cl_driver {
 	struct device_driver driver;
 	const char *name;
-	const guid_t *guid;
+	const struct ishtp_device_id *id;
 	int (*probe)(struct ishtp_cl_device *dev);
 	void (*remove)(struct ishtp_cl_device *dev);
 	int (*reset)(struct ishtp_cl_device *dev);
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index c70abe7aaef2..4bb71979a8fd 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -902,9 +902,11 @@ struct dfl_device_id {
 /**
  * struct ishtp_device_id - ISHTP device identifier
  * @guid: GUID of the device.
+ * @driver_data: pointer to driver specific data
  */
 struct ishtp_device_id {
 	guid_t guid;
+	kernel_ulong_t driver_data;
 };
 
 #endif /* LINUX_MOD_DEVICETABLE_H */
-- 
cgit v1.2.3


From 636f6e2af4fb916b9f1b432964294b6979c34002 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Date: Tue, 9 Nov 2021 08:45:25 +0900
Subject: libata: add horkage for missing Identify Device log

ACS-3 introduced the ATA Identify Device Data log as mandatory. A
warning message currently signals to the user if a device does not
report supporting this log page in the log directory page, regardless
of the ATA version of the device. Furthermore, this warning will appear
for all attempts at accessing this missing log page during device
revalidation.

Since it is useless to constantly access the log directory and warn
about this lack of support once we have discovered that the device
does not support this log page, introduce the horkage flag
ATA_HORKAGE_NO_ID_DEV_LOG to mark a device as lacking support for
the Identify Device Data log page. Set this flag when
ata_log_supported() returns false in ata_identify_page_supported().
The warning is printed only if the device ATA level is 10 or above
(ACS-3 or above), and only once on device scan. With this flag set, the
log directory page is not accessed again to test for Identify Device
Data log page support.

Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/linux/libata.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 5331557316e8..2a8404b26083 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -427,6 +427,7 @@ enum {
 	ATA_HORKAGE_MAX_SEC_1024 = (1 << 25),	/* Limit max sects to 1024 */
 	ATA_HORKAGE_MAX_TRIM_128M = (1 << 26),	/* Limit max trim size to 128M */
 	ATA_HORKAGE_NO_NCQ_ON_ATI = (1 << 27),	/* Disable NCQ on ATI chipset */
+	ATA_HORKAGE_NO_ID_DEV_LOG = (1 << 28),	/* Identify device log missing */
 
 	 /* DMA mask for user DMA control: User visible values; DO NOT
 	    renumber */
-- 
cgit v1.2.3


From 32a370abf12f82c8383e430c21365f5355d8b288 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Fri, 12 Nov 2021 12:07:02 -0500
Subject: net,lsm,selinux: revert the security_sctp_assoc_established() hook

This patch reverts two prior patches, e7310c94024c
("security: implement sctp_assoc_established hook in selinux") and
7c2ef0240e6a ("security: add sctp_assoc_established hook"), which
create the security_sctp_assoc_established() LSM hook and provide a
SELinux implementation.  Unfortunately these two patches were merged
without proper review (the Reviewed-by and Tested-by tags from
Richard Haines were for previous revisions of these patches that
were significantly different) and there are outstanding objections
from the SELinux maintainers regarding these patches.

Work is currently ongoing to correct the problems identified in the
reverted patches, as well as others that have come up during review,
but it is unclear at this point in time when that work will be ready
for inclusion in the mainline kernel.  In the interest of not keeping
objectionable code in the kernel for multiple weeks, and potentially
a kernel release, we are reverting the two problematic patches.

Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h | 2 --
 include/linux/lsm_hooks.h     | 5 -----
 include/linux/security.h      | 7 -------
 3 files changed, 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 442a611fa0fb..df8de62f4710 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -335,8 +335,6 @@ LSM_HOOK(int, 0, sctp_bind_connect, struct sock *sk, int optname,
 	 struct sockaddr *address, int addrlen)
 LSM_HOOK(void, LSM_RET_VOID, sctp_sk_clone, struct sctp_association *asoc,
 	 struct sock *sk, struct sock *newsk)
-LSM_HOOK(void, LSM_RET_VOID, sctp_assoc_established, struct sctp_association *asoc,
-	 struct sk_buff *skb)
 #endif /* CONFIG_SECURITY_NETWORK */
 
 #ifdef CONFIG_SECURITY_INFINIBAND
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index d6823214d5c1..d45b6f6e27fd 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1050,11 +1050,6 @@
  *	@asoc pointer to current sctp association structure.
  *	@sk pointer to current sock structure.
  *	@newsk pointer to new sock structure.
- * @sctp_assoc_established:
- *	Passes the @asoc and @chunk->skb of the association COOKIE_ACK packet
- *	to the security module.
- *	@asoc pointer to sctp association structure.
- *	@skb pointer to skbuff of association packet.
  *
  * Security hooks for Infiniband
  *
diff --git a/include/linux/security.h b/include/linux/security.h
index 06eac4e61a13..bbf44a466832 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1430,8 +1430,6 @@ int security_sctp_bind_connect(struct sock *sk, int optname,
 			       struct sockaddr *address, int addrlen);
 void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk,
 			    struct sock *newsk);
-void security_sctp_assoc_established(struct sctp_association *asoc,
-				     struct sk_buff *skb);
 
 #else	/* CONFIG_SECURITY_NETWORK */
 static inline int security_unix_stream_connect(struct sock *sock,
@@ -1651,11 +1649,6 @@ static inline void security_sctp_sk_clone(struct sctp_association *asoc,
 					  struct sock *newsk)
 {
 }
-
-static inline void security_sctp_assoc_established(struct sctp_association *asoc,
-						   struct sk_buff *skb)
-{
-}
 #endif	/* CONFIG_SECURITY_NETWORK */
 
 #ifdef CONFIG_SECURITY_INFINIBAND
-- 
cgit v1.2.3


From 9e2ad638ae3632ef916ceb39f70e3104bf8fdc97 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Fri, 12 Nov 2021 07:02:42 -0800
Subject: bpf: Extend BTF_ID_LIST_GLOBAL with parameter for number of IDs

syzbot reported the following BUG w/o CONFIG_DEBUG_INFO_BTF

BUG: KASAN: global-out-of-bounds in task_iter_init+0x212/0x2e7 kernel/bpf/task_iter.c:661
Read of size 4 at addr ffffffff90297404 by task swapper/0/1

CPU: 1 PID: 1 Comm: swapper/0 Not tainted 5.15.0-syzkaller #0
Hardware name: ... Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
<TASK>
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106
print_address_description.constprop.0.cold+0xf/0x309 mm/kasan/report.c:256
__kasan_report mm/kasan/report.c:442 [inline]
kasan_report.cold+0x83/0xdf mm/kasan/report.c:459
task_iter_init+0x212/0x2e7 kernel/bpf/task_iter.c:661
do_one_initcall+0x103/0x650 init/main.c:1295
do_initcall_level init/main.c:1368 [inline]
do_initcalls init/main.c:1384 [inline]
do_basic_setup init/main.c:1403 [inline]
kernel_init_freeable+0x6b1/0x73a init/main.c:1606
kernel_init+0x1a/0x1d0 init/main.c:1497
ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295
</TASK>

This is caused by hard-coded name[1] in BTF_ID_LIST_GLOBAL (w/o
CONFIG_DEBUG_INFO_BTF). Fix this by adding a parameter n to
BTF_ID_LIST_GLOBAL. This avoids ifdef CONFIG_DEBUG_INFO_BTF in btf.c and
filter.c.

Fixes: 7c7e3d31e785 ("bpf: Introduce helper bpf_find_vma")
Reported-by: syzbot+e0d81ec552a21d9071aa@syzkaller.appspotmail.com
Reported-by: Eric Dumazet <edumazet@google.com>
Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20211112150243.1270987-2-songliubraving@fb.com
---
 include/linux/btf_ids.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h
index 47d9abfbdb55..6bb42b785293 100644
--- a/include/linux/btf_ids.h
+++ b/include/linux/btf_ids.h
@@ -73,7 +73,7 @@ asm(							\
 __BTF_ID_LIST(name, local)				\
 extern u32 name[];
 
-#define BTF_ID_LIST_GLOBAL(name)			\
+#define BTF_ID_LIST_GLOBAL(name, n)			\
 __BTF_ID_LIST(name, globl)
 
 /* The BTF_ID_LIST_SINGLE macro defines a BTF_ID_LIST with
@@ -83,7 +83,7 @@ __BTF_ID_LIST(name, globl)
 	BTF_ID_LIST(name) \
 	BTF_ID(prefix, typename)
 #define BTF_ID_LIST_GLOBAL_SINGLE(name, prefix, typename) \
-	BTF_ID_LIST_GLOBAL(name) \
+	BTF_ID_LIST_GLOBAL(name, 1)			  \
 	BTF_ID(prefix, typename)
 
 /*
@@ -149,7 +149,7 @@ extern struct btf_id_set name;
 #define BTF_ID_LIST(name) static u32 name[5];
 #define BTF_ID(prefix, name)
 #define BTF_ID_UNUSED
-#define BTF_ID_LIST_GLOBAL(name) u32 name[1];
+#define BTF_ID_LIST_GLOBAL(name, n) u32 name[n];
 #define BTF_ID_LIST_SINGLE(name, prefix, typename) static u32 name[1];
 #define BTF_ID_LIST_GLOBAL_SINGLE(name, prefix, typename) u32 name[1];
 #define BTF_SET_START(name) static struct btf_id_set name = { 0 };
-- 
cgit v1.2.3


From d19ddb476a539fd78ad1028ae13bb38506286931 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Fri, 12 Nov 2021 07:02:43 -0800
Subject: bpf: Introduce btf_tracing_ids

Similar to btf_sock_ids, btf_tracing_ids provides btf ID for task_struct,
file, and vm_area_struct via easy to understand format like
btf_tracing_ids[BTF_TRACING_TYPE_[TASK|file|VMA]].

Suggested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20211112150243.1270987-3-songliubraving@fb.com
---
 include/linux/btf_ids.h | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h
index 6bb42b785293..919c0fde1c51 100644
--- a/include/linux/btf_ids.h
+++ b/include/linux/btf_ids.h
@@ -189,6 +189,18 @@ MAX_BTF_SOCK_TYPE,
 extern u32 btf_sock_ids[];
 #endif
 
-extern u32 btf_task_struct_ids[];
+#define BTF_TRACING_TYPE_xxx	\
+	BTF_TRACING_TYPE(BTF_TRACING_TYPE_TASK, task_struct)	\
+	BTF_TRACING_TYPE(BTF_TRACING_TYPE_FILE, file)		\
+	BTF_TRACING_TYPE(BTF_TRACING_TYPE_VMA, vm_area_struct)
+
+enum {
+#define BTF_TRACING_TYPE(name, type) name,
+BTF_TRACING_TYPE_xxx
+#undef BTF_TRACING_TYPE
+MAX_BTF_TRACING_TYPE,
+};
+
+extern u32 btf_tracing_ids[];
 
 #endif
-- 
cgit v1.2.3


From 1aa3b2207e889a948049c9a8016cedb0218c2389 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Fri, 12 Nov 2021 18:18:10 -0500
Subject: net,lsm,selinux: revert the security_sctp_assoc_established() hook

This patch reverts two prior patches, e7310c94024c
("security: implement sctp_assoc_established hook in selinux") and
7c2ef0240e6a ("security: add sctp_assoc_established hook"), which
create the security_sctp_assoc_established() LSM hook and provide a
SELinux implementation.  Unfortunately these two patches were merged
without proper review (the Reviewed-by and Tested-by tags from
Richard Haines were for previous revisions of these patches that
were significantly different) and there are outstanding objections
from the SELinux maintainers regarding these patches.

Work is currently ongoing to correct the problems identified in the
reverted patches, as well as others that have come up during review,
but it is unclear at this point in time when that work will be ready
for inclusion in the mainline kernel.  In the interest of not keeping
objectionable code in the kernel for multiple weeks, and potentially
a kernel release, we are reverting the two problematic patches.

Signed-off-by: Paul Moore <paul@paul-moore.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/lsm_hook_defs.h | 2 --
 include/linux/lsm_hooks.h     | 5 -----
 include/linux/security.h      | 7 -------
 3 files changed, 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 442a611fa0fb..df8de62f4710 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -335,8 +335,6 @@ LSM_HOOK(int, 0, sctp_bind_connect, struct sock *sk, int optname,
 	 struct sockaddr *address, int addrlen)
 LSM_HOOK(void, LSM_RET_VOID, sctp_sk_clone, struct sctp_association *asoc,
 	 struct sock *sk, struct sock *newsk)
-LSM_HOOK(void, LSM_RET_VOID, sctp_assoc_established, struct sctp_association *asoc,
-	 struct sk_buff *skb)
 #endif /* CONFIG_SECURITY_NETWORK */
 
 #ifdef CONFIG_SECURITY_INFINIBAND
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index d6823214d5c1..d45b6f6e27fd 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1050,11 +1050,6 @@
  *	@asoc pointer to current sctp association structure.
  *	@sk pointer to current sock structure.
  *	@newsk pointer to new sock structure.
- * @sctp_assoc_established:
- *	Passes the @asoc and @chunk->skb of the association COOKIE_ACK packet
- *	to the security module.
- *	@asoc pointer to sctp association structure.
- *	@skb pointer to skbuff of association packet.
  *
  * Security hooks for Infiniband
  *
diff --git a/include/linux/security.h b/include/linux/security.h
index 06eac4e61a13..bbf44a466832 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1430,8 +1430,6 @@ int security_sctp_bind_connect(struct sock *sk, int optname,
 			       struct sockaddr *address, int addrlen);
 void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk,
 			    struct sock *newsk);
-void security_sctp_assoc_established(struct sctp_association *asoc,
-				     struct sk_buff *skb);
 
 #else	/* CONFIG_SECURITY_NETWORK */
 static inline int security_unix_stream_connect(struct sock *sock,
@@ -1651,11 +1649,6 @@ static inline void security_sctp_sk_clone(struct sctp_association *asoc,
 					  struct sock *newsk)
 {
 }
-
-static inline void security_sctp_assoc_established(struct sctp_association *asoc,
-						   struct sk_buff *skb)
-{
-}
 #endif	/* CONFIG_SECURITY_NETWORK */
 
 #ifdef CONFIG_SECURITY_INFINIBAND
-- 
cgit v1.2.3


From 938aa33f14657c9ed9deea348b7d6f14b6d69cb7 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Sun, 14 Nov 2021 13:28:34 -0500
Subject: tracing: Add length protection to histogram string copies

The string copies to the histogram storage has a max size of 256 bytes
(defined by MAX_FILTER_STR_VAL). Only the string size of the event field
needs to be copied to the event storage, but no more than what is in the
event storage. Although nothing should be bigger than 256 bytes, there's
no protection against overwriting of the storage if one day there is.

Copy no more than the destination size, and enforce it.

Also had to turn MAX_FILTER_STR_VAL into an unsigned int, to keep the
min() comparison of the string sizes of comparable types.

Link: https://lore.kernel.org/all/CAHk-=wjREUihCGrtRBwfX47y_KrLCGjiq3t6QtoNJpmVrAEb1w@mail.gmail.com/
Link: https://lkml.kernel.org/r/20211114132834.183429a4@rorschach.local.home

Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Tom Zanussi <zanussi@kernel.org>
Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Fixes: 63f84ae6b82b ("tracing/histogram: Do not copy the fixed-size char array field over the field size")
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/trace_events.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 50453b287615..2d167ac3452c 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -673,7 +673,7 @@ struct trace_event_file {
 
 #define PERF_MAX_TRACE_SIZE	8192
 
-#define MAX_FILTER_STR_VAL	256	/* Should handle KSYM_SYMBOL_LEN */
+#define MAX_FILTER_STR_VAL	256U	/* Should handle KSYM_SYMBOL_LEN */
 
 enum event_trigger_type {
 	ETT_NONE		= (0),
-- 
cgit v1.2.3


From 4c7924fb905b02323ff6d9d20f370892615dccfa Mon Sep 17 00:00:00 2001
From: Julien Massot <julien.massot@iot.bzh>
Date: Fri, 22 Oct 2021 14:21:01 +0200
Subject: soc: renesas: rcar-rst: Add support to set rproc boot address

R-Car Gen3 SoC series has a realtime processor, the boot
address of this processor can be set thanks to CR7BAR register
of the reset module.

Export this function so that it's possible to set the boot
address from a remoteproc driver.

Also drop the __initdata qualifier on rcar_rst_base,
since we will use this address later than init time.

Signed-off-by: Julien Massot <julien.massot@iot.bzh>
Link: https://lore.kernel.org/r/20211022122101.66998-1-julien.massot@iot.bzh
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
---
 include/linux/soc/renesas/rcar-rst.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/soc/renesas/rcar-rst.h b/include/linux/soc/renesas/rcar-rst.h
index 7899a5b8c247..1f1fe8bfaa76 100644
--- a/include/linux/soc/renesas/rcar-rst.h
+++ b/include/linux/soc/renesas/rcar-rst.h
@@ -4,8 +4,10 @@
 
 #ifdef CONFIG_RST_RCAR
 int rcar_rst_read_mode_pins(u32 *mode);
+int rcar_rst_set_rproc_boot_addr(u64 boot_addr);
 #else
 static inline int rcar_rst_read_mode_pins(u32 *mode) { return -ENODEV; }
+static inline int rcar_rst_set_rproc_boot_addr(u64 boot_addr) { return -ENODEV; }
 #endif
 
 #endif /* __LINUX_SOC_RENESAS_RCAR_RST_H__ */
-- 
cgit v1.2.3


From 032816fbbfafe3198bb5c71fbbe4e8e5be33b352 Mon Sep 17 00:00:00 2001
From: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Date: Wed, 27 Oct 2021 14:45:07 +0100
Subject: pinctrl: pinconf-generic: Add support for "output-impedance-ohms" to
 be extracted from DT files

Add "output-impedance-ohms" property to generic options used for DT
parsing files. This enables drivers, which use generic pin configurations,
to get the value passed to this property.

Signed-off-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Reviewed-by: Biju Das <biju.das.jz@bp.renesas.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20211027134509.5036-3-prabhakar.mahadev-lad.rj@bp.renesas.com
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
---
 include/linux/pinctrl/pinconf-generic.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h
index eee0e3948537..2422211d6a5a 100644
--- a/include/linux/pinctrl/pinconf-generic.h
+++ b/include/linux/pinctrl/pinconf-generic.h
@@ -91,6 +91,8 @@ struct pinctrl_map;
  * 	configuration (eg. the currently selected mux function) drive values on
  * 	the line. Use argument 1 to enable output mode, argument 0 to disable
  * 	it.
+ * @PIN_CONFIG_OUTPUT_IMPEDANCE_OHMS: this will configure the output impedance
+ * 	of the pin with the value passed as argument. The argument is in ohms.
  * @PIN_CONFIG_PERSIST_STATE: retain pin state across sleep or controller reset
  * @PIN_CONFIG_POWER_SOURCE: if the pin can select between different power
  *	supplies, the argument to this parameter (on a custom format) tells
@@ -129,6 +131,7 @@ enum pin_config_param {
 	PIN_CONFIG_MODE_PWM,
 	PIN_CONFIG_OUTPUT,
 	PIN_CONFIG_OUTPUT_ENABLE,
+	PIN_CONFIG_OUTPUT_IMPEDANCE_OHMS,
 	PIN_CONFIG_PERSIST_STATE,
 	PIN_CONFIG_POWER_SOURCE,
 	PIN_CONFIG_SKEW_DELAY,
-- 
cgit v1.2.3


From 507805b83ff108473dba9d4909e41abd50cf07f5 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 10 Nov 2021 15:47:42 +0200
Subject: gpiolib: acpi: Remove never used devm_acpi_dev_remove_driver_gpios()

Remove never used devm_acpi_dev_remove_driver_gpios().

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
---
 include/linux/gpio/consumer.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h
index 97a28ad3393b..3ad67b4a72be 100644
--- a/include/linux/gpio/consumer.h
+++ b/include/linux/gpio/consumer.h
@@ -690,7 +690,6 @@ void acpi_dev_remove_driver_gpios(struct acpi_device *adev);
 
 int devm_acpi_dev_add_driver_gpios(struct device *dev,
 				   const struct acpi_gpio_mapping *gpios);
-void devm_acpi_dev_remove_driver_gpios(struct device *dev);
 
 struct gpio_desc *acpi_get_and_request_gpiod(char *path, int pin, char *label);
 
@@ -708,7 +707,6 @@ static inline int devm_acpi_dev_add_driver_gpios(struct device *dev,
 {
 	return -ENXIO;
 }
-static inline void devm_acpi_dev_remove_driver_gpios(struct device *dev) {}
 
 #endif /* CONFIG_GPIOLIB && CONFIG_ACPI */
 
-- 
cgit v1.2.3


From 10a2308ffb8cf262e473eb324fde42ae31b6da04 Mon Sep 17 00:00:00 2001
From: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Date: Fri, 12 Nov 2021 18:16:34 +0800
Subject: net: Clean up some inconsistent indenting

Eliminate the follow smatch warning:

./include/linux/skbuff.h:4229 skb_remcsum_process() warn: inconsistent
indenting.

Reported-by: Abaci Robot <abaci@linux.alibaba.com>
Signed-off-by: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 686a666d073d..c8cb7e697d47 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4226,7 +4226,7 @@ static inline void skb_remcsum_process(struct sk_buff *skb, void *ptr,
 		return;
 	}
 
-	 if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) {
+	if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) {
 		__skb_checksum_complete(skb);
 		skb_postpull_rcsum(skb, skb->data, ptr - (void *)skb->data);
 	}
-- 
cgit v1.2.3


From 02d6fdecb9c38de19065f6bed8d5214556fd061d Mon Sep 17 00:00:00 2001
From: Ansuel Smith <ansuelsmth@gmail.com>
Date: Thu, 4 Nov 2021 16:00:40 +0100
Subject: regmap: allow to define reg_update_bits for no bus configuration

Some device requires a special handling for reg_update_bits and can't use
the normal regmap read write logic. An example is when locking is
handled by the device and rmw operations requires to do atomic operations.
Allow to declare a dedicated function in regmap_config for
reg_update_bits in no bus configuration.

Signed-off-by: Ansuel Smith <ansuelsmth@gmail.com>
Link: https://lore.kernel.org/r/20211104150040.1260-1-ansuelsmth@gmail.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regmap.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index e3c9a25a853a..22652e5fbc38 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -290,6 +290,11 @@ typedef void (*regmap_unlock)(void *);
  *		  read operation on a bus such as SPI, I2C, etc. Most of the
  *		  devices do not need this.
  * @reg_write:	  Same as above for writing.
+ * @reg_update_bits: Optional callback that if filled will be used to perform
+ *		     all the update_bits(rmw) operation. Should only be provided
+ *		     if the function require special handling with lock and reg
+ *		     handling and the operation cannot be represented as a simple
+ *		     update_bits operation on a bus such as SPI, I2C, etc.
  * @fast_io:	  Register IO is fast. Use a spinlock instead of a mutex
  *	     	  to perform locking. This field is ignored if custom lock/unlock
  *	     	  functions are used (see fields lock/unlock of struct regmap_config).
@@ -372,6 +377,8 @@ struct regmap_config {
 
 	int (*reg_read)(void *context, unsigned int reg, unsigned int *val);
 	int (*reg_write)(void *context, unsigned int reg, unsigned int val);
+	int (*reg_update_bits)(void *context, unsigned int reg,
+			       unsigned int mask, unsigned int val);
 
 	bool fast_io;
 
-- 
cgit v1.2.3


From 45971bdd8ca8b5a99a49f4db86737401c45e246f Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Tue, 2 Nov 2021 16:02:02 -0600
Subject: spi: remove unused header file <linux/platform_data/spi-clps711x.h>

Commit 6acaadc852f1 ("spi: clps711x: Driver refactor") removed the only use
of <linux/platform_data/spi-clps711x.h>, but left the header file behind.
This file is unused, delete it.

Cc: Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Signed-off-by: Mark Brown <broonie@kernel.org>
Cc: linux-arm-kernel@lists.infradead.org
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/r/20211102220203.940290-9-corbet@lwn.net
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/platform_data/spi-clps711x.h | 17 -----------------
 1 file changed, 17 deletions(-)
 delete mode 100644 include/linux/platform_data/spi-clps711x.h

(limited to 'include/linux')

diff --git a/include/linux/platform_data/spi-clps711x.h b/include/linux/platform_data/spi-clps711x.h
deleted file mode 100644
index efaa596848c9..000000000000
--- a/include/linux/platform_data/spi-clps711x.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- *  CLPS711X SPI bus driver definitions
- *
- *  Copyright (C) 2012 Alexander Shiyan <shc_work@mail.ru>
- */
-
-#ifndef ____LINUX_PLATFORM_DATA_SPI_CLPS711X_H
-#define ____LINUX_PLATFORM_DATA_SPI_CLPS711X_H
-
-/* Board specific platform_data */
-struct spi_clps711x_pdata {
-	int *chipselect;	/* Array of GPIO-numbers */
-	int num_chipselect;	/* Total count of GPIOs */
-};
-
-#endif
-- 
cgit v1.2.3


From a0ddee65c527d877e798205c1391c6170e580c66 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 12 Nov 2021 16:07:49 +0200
Subject: printk: Remove printk.h inclusion in percpu.h

After the commit 42a0bb3f7138 ("printk/nmi: generic solution for safe
printk in NMI") the printk.h is not needed anymore in percpu.h.

Moreover `make headerdep` complains (an excerpt)

In file included from linux/printk.h,
                 from linux/dynamic_debug.h:188
                 from linux/printk.h:559 <-- here
                 from linux/percpu.h:9
                 from linux/idr.h:17
include/net/9p/client.h:13: warning: recursive header inclusion

Yeah, it's not a root cause of this, but removing will help to reduce
the noise.

Fixes: 42a0bb3f7138 ("printk/nmi: generic solution for safe printk in NMI")
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Dennis Zhou <dennis@kernel.org>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20211112140749.80042-1-andriy.shevchenko@linux.intel.com
---
 include/linux/percpu.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 5e76af742c80..4fa3000f9c22 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -6,7 +6,6 @@
 #include <linux/preempt.h>
 #include <linux/smp.h>
 #include <linux/cpumask.h>
-#include <linux/printk.h>
 #include <linux/pfn.h>
 #include <linux/init.h>
 
-- 
cgit v1.2.3


From 13cae4a104d2b7205696229ba85d34cc035f8c84 Mon Sep 17 00:00:00 2001
From: Matt Johnston <matt@codeconstruct.com.au>
Date: Mon, 15 Nov 2021 10:49:21 +0800
Subject: i2c: core: Allow 255 byte transfers for SMBus 3.x

SMBus 3.0 increased the maximum block transfer size from 32 bytes to
255 bytes. We increase the size of struct i2c_smbus_data's block[]
member.

i2c_smbus_xfer() and i2c_smbus_xfer_emulated() now support 255 byte
block operations, other block functions remain limited to 32 bytes for
compatibility with existing callers.

We allow adapters to indicate support for the larger size with
I2C_FUNC_SMBUS_V3_BLOCK. Most emulated drivers should be able to use 255
byte blocks by replacing I2C_SMBUS_BLOCK_MAX with I2C_SMBUS_V3_BLOCK_MAX
though some will have hardware limitations that need testing.

Signed-off-by: Matt Johnston <matt@codeconstruct.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/i2c.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 16119ac1aa97..353d6b4e7a53 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -52,6 +52,19 @@ typedef int (*i2c_slave_cb_t)(struct i2c_client *client,
 struct module;
 struct property_entry;
 
+/* SMBus 3.0 extends the maximum block read/write size to 255 (from 32).
+ * The larger size is only supported by some drivers, indicated by
+ * the I2C_FUNC_SMBUS_V3_BLOCK functionality bit.
+ */
+#define I2C_SMBUS_V3_BLOCK_MAX	255	/* As specified in SMBus 3.0 standard */
+
+/* Note compatibility definition in uapi header with 32 byte block */
+union i2c_smbus_data {
+	__u8 byte;
+	__u16 word;
+	__u8 block[I2C_SMBUS_V3_BLOCK_MAX + 1]; /* block[0] is used for length */
+};
+
 #if IS_ENABLED(CONFIG_I2C)
 /* Return the Frequency mode string based on the bus frequency */
 const char *i2c_freq_mode_string(u32 bus_freq_hz);
-- 
cgit v1.2.3


From 34ae2c09d46a2d0abd907e139b466f798e4095a8 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Mon, 15 Nov 2021 10:00:27 +0000
Subject: net: phylink: add generic validate implementation

Add a generic validate() implementation using the supported_interfaces
and a bitmask of MAC pause/speed/duplex capabilities. This allows us
to entirely eliminate many driver private validate() implementations.

We expose the underlying phylink_get_linkmodes() function so that
drivers which have special needs can still benefit from conversion.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phylink.h | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index f037470b6fb3..3563820a1765 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -20,6 +20,29 @@ enum {
 	MLO_AN_PHY = 0,	/* Conventional PHY */
 	MLO_AN_FIXED,	/* Fixed-link mode */
 	MLO_AN_INBAND,	/* In-band protocol */
+
+	MAC_SYM_PAUSE	= BIT(0),
+	MAC_ASYM_PAUSE	= BIT(1),
+	MAC_10HD	= BIT(2),
+	MAC_10FD	= BIT(3),
+	MAC_10		= MAC_10HD | MAC_10FD,
+	MAC_100HD	= BIT(4),
+	MAC_100FD	= BIT(5),
+	MAC_100		= MAC_100HD | MAC_100FD,
+	MAC_1000HD	= BIT(6),
+	MAC_1000FD	= BIT(7),
+	MAC_1000	= MAC_1000HD | MAC_1000FD,
+	MAC_2500FD	= BIT(8),
+	MAC_5000FD	= BIT(9),
+	MAC_10000FD	= BIT(10),
+	MAC_20000FD	= BIT(11),
+	MAC_25000FD	= BIT(12),
+	MAC_40000FD	= BIT(13),
+	MAC_50000FD	= BIT(14),
+	MAC_56000FD	= BIT(15),
+	MAC_100000FD	= BIT(16),
+	MAC_200000FD	= BIT(17),
+	MAC_400000FD	= BIT(18),
 };
 
 static inline bool phylink_autoneg_inband(unsigned int mode)
@@ -69,6 +92,7 @@ enum phylink_op_type {
  *		     if MAC link is at %MLO_AN_FIXED mode.
  * @supported_interfaces: bitmap describing which PHY_INTERFACE_MODE_xxx
  *                        are supported by the MAC/PCS.
+ * @mac_capabilities: MAC pause/speed/duplex capabilities.
  */
 struct phylink_config {
 	struct device *dev;
@@ -79,6 +103,7 @@ struct phylink_config {
 	void (*get_fixed_state)(struct phylink_config *config,
 				struct phylink_link_state *state);
 	DECLARE_PHY_INTERFACE_MASK(supported_interfaces);
+	unsigned long mac_capabilities;
 };
 
 /**
@@ -442,6 +467,12 @@ void pcs_link_up(struct phylink_pcs *pcs, unsigned int mode,
 		 phy_interface_t interface, int speed, int duplex);
 #endif
 
+void phylink_get_linkmodes(unsigned long *linkmodes, phy_interface_t interface,
+			   unsigned long mac_capabilities);
+void phylink_generic_validate(struct phylink_config *config,
+			      unsigned long *supported,
+			      struct phylink_link_state *state);
+
 struct phylink *phylink_create(struct phylink_config *, struct fwnode_handle *,
 			       phy_interface_t iface,
 			       const struct phylink_mac_ops *mac_ops);
-- 
cgit v1.2.3


From 2f6a470d6545841cf1891b87e360d3998ef024c8 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 15 Nov 2021 07:49:46 -0800
Subject: Revert "Merge branch 'mctp-i2c-driver'"

This reverts commit 71812af7234f30362b43ccff33f93890ae4c0655, reversing
changes made to cc0be1ad686fb29a4d127948486f40b17fb34b50.

Wolfram Sang says:

Please revert. Besides the driver in net, it modifies the I2C core
code. This has not been acked by the I2C maintainer (in this case me).
So, please don't pull this in via the net tree. The question raised here
(extending SMBus calls to 255 byte) is complicated because we need ABI
backwards compatibility.

Link: https://lore.kernel.org/all/YZJ9H4eM%2FM7OXVN0@shikoro/
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/i2c.h | 13 -------------
 1 file changed, 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 353d6b4e7a53..16119ac1aa97 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -52,19 +52,6 @@ typedef int (*i2c_slave_cb_t)(struct i2c_client *client,
 struct module;
 struct property_entry;
 
-/* SMBus 3.0 extends the maximum block read/write size to 255 (from 32).
- * The larger size is only supported by some drivers, indicated by
- * the I2C_FUNC_SMBUS_V3_BLOCK functionality bit.
- */
-#define I2C_SMBUS_V3_BLOCK_MAX	255	/* As specified in SMBus 3.0 standard */
-
-/* Note compatibility definition in uapi header with 32 byte block */
-union i2c_smbus_data {
-	__u8 byte;
-	__u16 word;
-	__u8 block[I2C_SMBUS_V3_BLOCK_MAX + 1]; /* block[0] is used for length */
-};
-
 #if IS_ENABLED(CONFIG_I2C)
 /* Return the Frequency mode string based on the bus frequency */
 const char *i2c_freq_mode_string(u32 bus_freq_hz);
-- 
cgit v1.2.3


From f64bd790b750dd281406964af40d16adfc88a074 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 29 Oct 2021 12:51:32 -0700
Subject: ACPI: Keep sub-table parsing infrastructure available for modules

The NFIT driver and now the CXL ACPI driver have both open-coded ACPI
table parsing. Before another instance is added arrange for the core
ACPI sub-table parsing to be optionally available to drivers via the
CONFIG_ACPI_TABLE_LIB symbol. If no drivers select the symbol then the
infrastructure reverts back to being tagged __init via the
__init_or_acpilib annotation.

For now, only tag the core sub-table routines and data that the CEDT parsing in
the cxl_acpi driver would want to reuse, a CEDT parsing helper is added
in a later change.

Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Alison Schofield <alison.schofield@intel.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Link: https://lore.kernel.org/r/163553709227.2509508.8215196520233473814.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/acpi.h | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 143ce7e0bee1..edfa3c8f3562 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -232,14 +232,22 @@ int acpi_locate_initial_tables (void);
 void acpi_reserve_initial_tables (void);
 void acpi_table_init_complete (void);
 int acpi_table_init (void);
+
+#ifdef CONFIG_ACPI_TABLE_LIB
+#define __init_or_acpilib
+#define __initdata_or_acpilib
+#else
+#define __init_or_acpilib __init
+#define __initdata_or_acpilib __initdata
+#endif
+
 int acpi_table_parse(char *id, acpi_tbl_table_handler handler);
-int __init acpi_table_parse_entries(char *id, unsigned long table_size,
-			      int entry_id,
-			      acpi_tbl_entry_handler handler,
-			      unsigned int max_entries);
-int __init acpi_table_parse_entries_array(char *id, unsigned long table_size,
-			      struct acpi_subtable_proc *proc, int proc_num,
-			      unsigned int max_entries);
+int __init_or_acpilib acpi_table_parse_entries(char *id,
+		unsigned long table_size, int entry_id,
+		acpi_tbl_entry_handler handler, unsigned int max_entries);
+int __init_or_acpilib acpi_table_parse_entries_array(char *id,
+		unsigned long table_size, struct acpi_subtable_proc *proc,
+		int proc_num, unsigned int max_entries);
 int acpi_table_parse_madt(enum acpi_madt_type id,
 			  acpi_tbl_entry_handler handler,
 			  unsigned int max_entries);
-- 
cgit v1.2.3


From ad2f63971e9655e3987db32dac85aa50658790eb Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 29 Oct 2021 12:51:37 -0700
Subject: ACPI: Teach ACPI table parsing about the CEDT header format

The CEDT adds yet one more unique subtable header type where the length
is a 16-bit value. Extend the subtable helpers to detect this scenario.

Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Len Brown <lenb@kernel.org>
Tested-by: Alison Schofield <alison.schofield@intel.com>
Reviewed-by: Alison Schofield <alison.schofield@intel.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Link: https://lore.kernel.org/r/163553709742.2509508.5177761945441327574.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/acpi.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index edfa3c8f3562..6b7f181d51e2 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -133,6 +133,7 @@ union acpi_subtable_headers {
 	struct acpi_subtable_header common;
 	struct acpi_hmat_structure hmat;
 	struct acpi_prmt_module_header prmt;
+	struct acpi_cedt_header cedt;
 };
 
 typedef int (*acpi_tbl_table_handler)(struct acpi_table_header *table);
-- 
cgit v1.2.3


From 2d03e46a4bad20191d07b83ec1242d5f002577be Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 29 Oct 2021 12:51:42 -0700
Subject: ACPI: Add a context argument for table parsing handlers

In preparation for drivers reusing the core table parsing
infrastructure, arrange for handlers to take a context argument. This
allows driver table parsing to wrap ACPI table entries in
driver-specific data.

The first consumer of this infrastructure is the CEDT parsing that
happens in the cxl_acpi driver, add a conditional
(CONFIG_ACPI_TABLE_LIB=y) export of a acpi_table_parse_cedt() helper for
this case.

Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Len Brown <lenb@kernel.org>
Tested-by: Alison Schofield <alison.schofield@intel.com>
Reviewed-by: Alison Schofield <alison.schofield@intel.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Link: https://lore.kernel.org/r/163553710257.2509508.14310494417463866020.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/acpi.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 6b7f181d51e2..95f88108f664 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -141,6 +141,9 @@ typedef int (*acpi_tbl_table_handler)(struct acpi_table_header *table);
 typedef int (*acpi_tbl_entry_handler)(union acpi_subtable_headers *header,
 				      const unsigned long end);
 
+typedef int (*acpi_tbl_entry_handler_arg)(union acpi_subtable_headers *header,
+					  void *arg, const unsigned long end);
+
 /* Debugger support */
 
 struct acpi_debugger_ops {
@@ -217,6 +220,8 @@ static inline int acpi_debugger_notify_command_complete(void)
 struct acpi_subtable_proc {
 	int id;
 	acpi_tbl_entry_handler handler;
+	acpi_tbl_entry_handler_arg handler_arg;
+	void *arg;
 	int count;
 };
 
@@ -235,9 +240,11 @@ void acpi_table_init_complete (void);
 int acpi_table_init (void);
 
 #ifdef CONFIG_ACPI_TABLE_LIB
+#define EXPORT_SYMBOL_ACPI_LIB(x) EXPORT_SYMBOL_NS_GPL(x, ACPI)
 #define __init_or_acpilib
 #define __initdata_or_acpilib
 #else
+#define EXPORT_SYMBOL_ACPI_LIB(x)
 #define __init_or_acpilib __init
 #define __initdata_or_acpilib __initdata
 #endif
@@ -252,6 +259,10 @@ int __init_or_acpilib acpi_table_parse_entries_array(char *id,
 int acpi_table_parse_madt(enum acpi_madt_type id,
 			  acpi_tbl_entry_handler handler,
 			  unsigned int max_entries);
+int __init_or_acpilib
+acpi_table_parse_cedt(enum acpi_cedt_type id,
+		      acpi_tbl_entry_handler_arg handler_arg, void *arg);
+
 int acpi_parse_mcfg (struct acpi_table_header *header);
 void acpi_table_print_madt_entry (struct acpi_subtable_header *madt);
 
-- 
cgit v1.2.3


From 03b122da74b22fbe7cd98184fa5657a9ce13970c Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Tue, 26 Oct 2021 15:00:48 -0700
Subject: x86/sgx: Hook arch_memory_failure() into mainline code

Add a call inside memory_failure() to call the arch specific code
to check if the address is an SGX EPC page and handle it.

Note the SGX EPC pages do not have a "struct page" entry, so the hook
goes in at the same point as the device mapping hook.

Pull the call to acquire the mutex earlier so the SGX errors are also
protected.

Make set_mce_nospec() skip SGX pages when trying to adjust
the 1:1 map.

Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Reviewed-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Tested-by: Reinette Chatre <reinette.chatre@intel.com>
Link: https://lkml.kernel.org/r/20211026220050.697075-6-tony.luck@intel.com
---
 include/linux/mm.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index a7e4a9e7d807..57f1aa2a33b6 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3231,6 +3231,19 @@ extern void shake_page(struct page *p);
 extern atomic_long_t num_poisoned_pages __read_mostly;
 extern int soft_offline_page(unsigned long pfn, int flags);
 
+#ifndef arch_memory_failure
+static inline int arch_memory_failure(unsigned long pfn, int flags)
+{
+	return -ENXIO;
+}
+#endif
+
+#ifndef arch_is_platform_page
+static inline bool arch_is_platform_page(u64 paddr)
+{
+	return false;
+}
+#endif
 
 /*
  * Error handlers for various types of pages.
-- 
cgit v1.2.3


From 749303055b78bc38ec0790ccc596cae235446367 Mon Sep 17 00:00:00 2001
From: Simon Trimmer <simont@opensource.cirrus.com>
Date: Mon, 15 Nov 2021 12:02:15 +0000
Subject: firmware: cs_dsp: tidy includes in cs_dsp.c and cs_dsp.h

This patch removes unused included header files and moves others into
cs_dsp.h to ensure that types referenced in the header file are properly
described to prevent compiler warnings.

Signed-off-by: Simon Trimmer <simont@opensource.cirrus.com>
Acked-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://lore.kernel.org/r/20211115120215.56824-1-simont@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/firmware/cirrus/cs_dsp.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/firmware/cirrus/cs_dsp.h b/include/linux/firmware/cirrus/cs_dsp.h
index 9ad9eaaaa552..3a54b1afc48f 100644
--- a/include/linux/firmware/cirrus/cs_dsp.h
+++ b/include/linux/firmware/cirrus/cs_dsp.h
@@ -11,6 +11,11 @@
 #ifndef __CS_DSP_H
 #define __CS_DSP_H
 
+#include <linux/device.h>
+#include <linux/firmware.h>
+#include <linux/list.h>
+#include <linux/regmap.h>
+
 #define CS_ADSP2_REGION_0 BIT(0)
 #define CS_ADSP2_REGION_1 BIT(1)
 #define CS_ADSP2_REGION_2 BIT(2)
-- 
cgit v1.2.3


From e9380df851878cee71df5a1c7611584421527f7e Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Sun, 31 Oct 2021 20:48:52 -0500
Subject: ACPI: Add stubs for wakeup handler functions

The commit ddfd9dcf270c ("ACPI: PM: Add acpi_[un]register_wakeup_handler()")
added new functions for drivers to use during the s2idle wakeup path, but
didn't add stubs for when CONFIG_ACPI wasn't set.

Add those stubs in for other drivers to be able to use.

Fixes: ddfd9dcf270c ("ACPI: PM: Add acpi_[un]register_wakeup_handler()")
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Link: https://lore.kernel.org/r/20211101014853.6177-1-mario.limonciello@amd.com
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/acpi.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 143ce7e0bee1..668d007f0917 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -974,6 +974,15 @@ static inline int acpi_get_local_address(acpi_handle handle, u32 *addr)
 	return -ENODEV;
 }
 
+static inline int acpi_register_wakeup_handler(int wake_irq,
+	bool (*wakeup)(void *context), void *context)
+{
+	return -ENXIO;
+}
+
+static inline void acpi_unregister_wakeup_handler(
+	bool (*wakeup)(void *context), void *context) { }
+
 #endif	/* !CONFIG_ACPI */
 
 #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
-- 
cgit v1.2.3


From a3143f7822a9eeb38f0e046080ae8f79f6c7122d Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Tue, 2 Nov 2021 16:01:58 -0600
Subject: Remove unused header <linux/sdb.h>

Commit 6a80b30086b8 ("fmc: Delete the FMC subsystem") removed the last user
of <linux/sdb.h>, but left the header file behind.  Nothing uses this file,
delete it now.

Cc: Linus Walleij <linus.walleij@linaro.org>
Cc: Alessandro Rubini <rubini@gnudd.com>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Acked-by: Alessandro Rubini <rubini@gnudd.com>
Link: https://lore.kernel.org/r/20211102220203.940290-5-corbet@lwn.net
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/sdb.h | 160 ----------------------------------------------------
 1 file changed, 160 deletions(-)
 delete mode 100644 include/linux/sdb.h

(limited to 'include/linux')

diff --git a/include/linux/sdb.h b/include/linux/sdb.h
deleted file mode 100644
index a2404a2bbd10..000000000000
--- a/include/linux/sdb.h
+++ /dev/null
@@ -1,160 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * This is the official version 1.1 of sdb.h
- */
-#ifndef __SDB_H__
-#define __SDB_H__
-#ifdef __KERNEL__
-#include <linux/types.h>
-#else
-#include <stdint.h>
-#endif
-
-/*
- * All structures are 64 bytes long and are expected
- * to live in an array, one for each interconnect.
- * Most fields of the structures are shared among the
- * various types, and most-specific fields are at the
- * beginning (for alignment reasons, and to keep the
- * magic number at the head of the interconnect record
- */
-
-/* Product, 40 bytes at offset 24, 8-byte aligned
- *
- * device_id is vendor-assigned; version is device-specific,
- * date is hex (e.g 0x20120501), name is UTF-8, blank-filled
- * and not terminated with a 0 byte.
- */
-struct sdb_product {
-	uint64_t		vendor_id;	/* 0x18..0x1f */
-	uint32_t		device_id;	/* 0x20..0x23 */
-	uint32_t		version;	/* 0x24..0x27 */
-	uint32_t		date;		/* 0x28..0x2b */
-	uint8_t			name[19];	/* 0x2c..0x3e */
-	uint8_t			record_type;	/* 0x3f */
-};
-
-/*
- * Component, 56 bytes at offset 8, 8-byte aligned
- *
- * The address range is first to last, inclusive
- * (for example 0x100000 - 0x10ffff)
- */
-struct sdb_component {
-	uint64_t		addr_first;	/* 0x08..0x0f */
-	uint64_t		addr_last;	/* 0x10..0x17 */
-	struct sdb_product	product;	/* 0x18..0x3f */
-};
-
-/* Type of the SDB record */
-enum sdb_record_type {
-	sdb_type_interconnect	= 0x00,
-	sdb_type_device		= 0x01,
-	sdb_type_bridge		= 0x02,
-	sdb_type_integration	= 0x80,
-	sdb_type_repo_url	= 0x81,
-	sdb_type_synthesis	= 0x82,
-	sdb_type_empty		= 0xFF,
-};
-
-/* Type 0: interconnect (first of the array)
- *
- * sdb_records is the length of the table including this first
- * record, version is 1. The bus type is enumerated later.
- */
-#define				SDB_MAGIC	0x5344422d /* "SDB-" */
-struct sdb_interconnect {
-	uint32_t		sdb_magic;	/* 0x00-0x03 */
-	uint16_t		sdb_records;	/* 0x04-0x05 */
-	uint8_t			sdb_version;	/* 0x06 */
-	uint8_t			sdb_bus_type;	/* 0x07 */
-	struct sdb_component	sdb_component;	/* 0x08-0x3f */
-};
-
-/* Type 1: device
- *
- * class is 0 for "custom device", other values are
- * to be standardized; ABI version is for the driver,
- * bus-specific bits are defined by each bus (see below)
- */
-struct sdb_device {
-	uint16_t		abi_class;	/* 0x00-0x01 */
-	uint8_t			abi_ver_major;	/* 0x02 */
-	uint8_t			abi_ver_minor;	/* 0x03 */
-	uint32_t		bus_specific;	/* 0x04-0x07 */
-	struct sdb_component	sdb_component;	/* 0x08-0x3f */
-};
-
-/* Type 2: bridge
- *
- * child is the address of the nested SDB table
- */
-struct sdb_bridge {
-	uint64_t		sdb_child;	/* 0x00-0x07 */
-	struct sdb_component	sdb_component;	/* 0x08-0x3f */
-};
-
-/* Type 0x80: integration
- *
- * all types with bit 7 set are meta-information, so
- * software can ignore the types it doesn't know. Here we
- * just provide product information for an aggregate device
- */
-struct sdb_integration {
-	uint8_t			reserved[24];	/* 0x00-0x17 */
-	struct sdb_product	product;	/* 0x08-0x3f */
-};
-
-/* Type 0x81: Top module repository url
- *
- * again, an informative field that software can ignore
- */
-struct sdb_repo_url {
-	uint8_t			repo_url[63];	/* 0x00-0x3e */
-	uint8_t			record_type;	/* 0x3f */
-};
-
-/* Type 0x82: Synthesis tool information
- *
- * this informative record
- */
-struct sdb_synthesis {
-	uint8_t			syn_name[16];	/* 0x00-0x0f */
-	uint8_t			commit_id[16];	/* 0x10-0x1f */
-	uint8_t			tool_name[8];	/* 0x20-0x27 */
-	uint32_t		tool_version;	/* 0x28-0x2b */
-	uint32_t		date;		/* 0x2c-0x2f */
-	uint8_t			user_name[15];	/* 0x30-0x3e */
-	uint8_t			record_type;	/* 0x3f */
-};
-
-/* Type 0xff: empty
- *
- * this allows keeping empty slots during development,
- * so they can be filled later with minimal efforts and
- * no misleading description is ever shipped -- hopefully.
- * It can also be used to pad a table to a desired length.
- */
-struct sdb_empty {
-	uint8_t			reserved[63];	/* 0x00-0x3e */
-	uint8_t			record_type;	/* 0x3f */
-};
-
-/* The type of bus, for bus-specific flags */
-enum sdb_bus_type {
-	sdb_wishbone = 0x00,
-	sdb_data     = 0x01,
-};
-
-#define SDB_WB_WIDTH_MASK	0x0f
-#define SDB_WB_ACCESS8			0x01
-#define SDB_WB_ACCESS16			0x02
-#define SDB_WB_ACCESS32			0x04
-#define SDB_WB_ACCESS64			0x08
-#define SDB_WB_LITTLE_ENDIAN	0x80
-
-#define SDB_DATA_READ		0x04
-#define SDB_DATA_WRITE		0x02
-#define SDB_DATA_EXEC		0x01
-
-#endif /* __SDB_H__ */
-- 
cgit v1.2.3


From 353050be4c19e102178ccc05988101887c25ae53 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 9 Nov 2021 18:48:08 +0000
Subject: bpf: Fix toctou on read-only map's constant scalar tracking

Commit a23740ec43ba ("bpf: Track contents of read-only maps as scalars") is
checking whether maps are read-only both from BPF program side and user space
side, and then, given their content is constant, reading out their data via
map->ops->map_direct_value_addr() which is then subsequently used as known
scalar value for the register, that is, it is marked as __mark_reg_known()
with the read value at verification time. Before a23740ec43ba, the register
content was marked as an unknown scalar so the verifier could not make any
assumptions about the map content.

The current implementation however is prone to a TOCTOU race, meaning, the
value read as known scalar for the register is not guaranteed to be exactly
the same at a later point when the program is executed, and as such, the
prior made assumptions of the verifier with regards to the program will be
invalid which can cause issues such as OOB access, etc.

While the BPF_F_RDONLY_PROG map flag is always fixed and required to be
specified at map creation time, the map->frozen property is initially set to
false for the map given the map value needs to be populated, e.g. for global
data sections. Once complete, the loader "freezes" the map from user space
such that no subsequent updates/deletes are possible anymore. For the rest
of the lifetime of the map, this freeze one-time trigger cannot be undone
anymore after a successful BPF_MAP_FREEZE cmd return. Meaning, any new BPF_*
cmd calls which would update/delete map entries will be rejected with -EPERM
since map_get_sys_perms() removes the FMODE_CAN_WRITE permission. This also
means that pending update/delete map entries must still complete before this
guarantee is given. This corner case is not an issue for loaders since they
create and prepare such program private map in successive steps.

However, a malicious user is able to trigger this TOCTOU race in two different
ways: i) via userfaultfd, and ii) via batched updates. For i) userfaultfd is
used to expand the competition interval, so that map_update_elem() can modify
the contents of the map after map_freeze() and bpf_prog_load() were executed.
This works, because userfaultfd halts the parallel thread which triggered a
map_update_elem() at the time where we copy key/value from the user buffer and
this already passed the FMODE_CAN_WRITE capability test given at that time the
map was not "frozen". Then, the main thread performs the map_freeze() and
bpf_prog_load(), and once that had completed successfully, the other thread
is woken up to complete the pending map_update_elem() which then changes the
map content. For ii) the idea of the batched update is similar, meaning, when
there are a large number of updates to be processed, it can increase the
competition interval between the two. It is therefore possible in practice to
modify the contents of the map after executing map_freeze() and bpf_prog_load().

One way to fix both i) and ii) at the same time is to expand the use of the
map's map->writecnt. The latter was introduced in fc9702273e2e ("bpf: Add mmap()
support for BPF_MAP_TYPE_ARRAY") and further refined in 1f6cb19be2e2 ("bpf:
Prevent re-mmap()'ing BPF map as writable for initially r/o mapping") with
the rationale to make a writable mmap()'ing of a map mutually exclusive with
read-only freezing. The counter indicates writable mmap() mappings and then
prevents/fails the freeze operation. Its semantics can be expanded beyond
just mmap() by generally indicating ongoing write phases. This would essentially
span any parallel regular and batched flavor of update/delete operation and
then also have map_freeze() fail with -EBUSY. For the check_mem_access() in
the verifier we expand upon the bpf_map_is_rdonly() check ensuring that all
last pending writes have completed via bpf_map_write_active() test. Once the
map->frozen is set and bpf_map_write_active() indicates a map->writecnt of 0
only then we are really guaranteed to use the map's data as known constants.
For map->frozen being set and pending writes in process of still being completed
we fall back to marking that register as unknown scalar so we don't end up
making assumptions about it. With this, both TOCTOU reproducers from i) and
ii) are fixed.

Note that the map->writecnt has been converted into a atomic64 in the fix in
order to avoid a double freeze_mutex mutex_{un,}lock() pair when updating
map->writecnt in the various map update/delete BPF_* cmd flavors. Spanning
the freeze_mutex over entire map update/delete operations in syscall side
would not be possible due to then causing everything to be serialized.
Similarly, something like synchronize_rcu() after setting map->frozen to wait
for update/deletes to complete is not possible either since it would also
have to span the user copy which can sleep. On the libbpf side, this won't
break d66562fba1ce ("libbpf: Add BPF object skeleton support") as the
anonymous mmap()-ed "map initialization image" is remapped as a BPF map-backed
mmap()-ed memory where for .rodata it's non-writable.

Fixes: a23740ec43ba ("bpf: Track contents of read-only maps as scalars")
Reported-by: w1tcher.bupt@gmail.com
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f715e8863f4d..e7a163a3146b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -193,7 +193,7 @@ struct bpf_map {
 	atomic64_t usercnt;
 	struct work_struct work;
 	struct mutex freeze_mutex;
-	u64 writecnt; /* writable mmap cnt; protected by freeze_mutex */
+	atomic64_t writecnt;
 };
 
 static inline bool map_value_has_spin_lock(const struct bpf_map *map)
@@ -1419,6 +1419,7 @@ void bpf_map_put(struct bpf_map *map);
 void *bpf_map_area_alloc(u64 size, int numa_node);
 void *bpf_map_area_mmapable_alloc(u64 size, int numa_node);
 void bpf_map_area_free(void *base);
+bool bpf_map_write_active(const struct bpf_map *map);
 void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);
 int  generic_map_lookup_batch(struct bpf_map *map,
 			      const union bpf_attr *attr,
-- 
cgit v1.2.3


From 0f0ac158d28ff78e75c334e869b1cb8e69372a1f Mon Sep 17 00:00:00 2001
From: "Luke D. Jones" <luke@ljones.dev>
Date: Sun, 24 Oct 2021 16:37:05 +1300
Subject: platform/x86: asus-wmi: Add support for custom fan curves

Add support for custom fan curves found on some ASUS ROG laptops.

These laptops have the ability to set a custom curve for the CPU
and GPU fans via two ACPI methods.

This patch adds two pwm<N> attributes to the hwmon sysfs,
pwm1 for CPU fan, pwm2 for GPU fan. Both are under the hwmon of the
name `asus_custom_fan_curve`. There is no safety check of the set
fan curves - this must be done in userspace.

The fans have settings [1,2,3] under pwm<N>_enable:
1. Enable and write settings out
2. Disable and use factory fan mode
3. Same as 2, additionally restoring default factory curve.

Use of 2 means that the curve the user has set is still stored and
won't be erased, but the laptop will be using its default auto-fan
mode. Re-enabling the manual mode then activates the curves again.

Notes:
- pwm<N>_enable = 0 is an invalid setting.
- pwm is actually a percentage and is scaled on writing to device.

Signed-off-by: Luke D. Jones <luke@ljones.dev>
Link: https://lore.kernel.org/r/20211024033705.5595-2-luke@ljones.dev
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/platform_data/x86/asus-wmi.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h
index 17dc5cb6f3f2..a571b47ff362 100644
--- a/include/linux/platform_data/x86/asus-wmi.h
+++ b/include/linux/platform_data/x86/asus-wmi.h
@@ -77,6 +77,8 @@
 #define ASUS_WMI_DEVID_THERMAL_CTRL	0x00110011
 #define ASUS_WMI_DEVID_FAN_CTRL		0x00110012 /* deprecated */
 #define ASUS_WMI_DEVID_CPU_FAN_CTRL	0x00110013
+#define ASUS_WMI_DEVID_CPU_FAN_CURVE	0x00110024
+#define ASUS_WMI_DEVID_GPU_FAN_CURVE	0x00110025
 
 /* Power */
 #define ASUS_WMI_DEVID_PROCESSOR_STATE	0x00120012
-- 
cgit v1.2.3


From 38543b72fbe52b7eec0dedd420d80a06c652d8e4 Mon Sep 17 00:00:00 2001
From: Maximilian Luz <luzmaximilian@gmail.com>
Date: Thu, 28 Oct 2021 02:22:41 +0200
Subject: platform/surface: aggregator: Make client device removal more generic

Currently, there are similar functions defined in the Aggregator
Registry and the controller core.

Make client device removal more generic and export it. We can then use
this function later on to remove client devices from device hubs as well
as the controller and avoid re-defining similar things.

Signed-off-by: Maximilian Luz <luzmaximilian@gmail.com>
Link: https://lore.kernel.org/r/20211028002243.1586083-2-luzmaximilian@gmail.com
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/surface_aggregator/device.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/surface_aggregator/device.h b/include/linux/surface_aggregator/device.h
index f636c5310321..cc257097eb05 100644
--- a/include/linux/surface_aggregator/device.h
+++ b/include/linux/surface_aggregator/device.h
@@ -319,6 +319,15 @@ void ssam_device_driver_unregister(struct ssam_device_driver *d);
 		      ssam_device_driver_unregister)
 
 
+/* -- Helpers for controller and hub devices. ------------------------------- */
+
+#ifdef CONFIG_SURFACE_AGGREGATOR_BUS
+void ssam_remove_clients(struct device *dev);
+#else /* CONFIG_SURFACE_AGGREGATOR_BUS */
+static inline void ssam_remove_clients(struct device *dev) {}
+#endif /* CONFIG_SURFACE_AGGREGATOR_BUS */
+
+
 /* -- Helpers for client-device requests. ----------------------------------- */
 
 /**
-- 
cgit v1.2.3


From ebf7f6f0a6cdcc17a3da52b81e4b3a98c4005028 Mon Sep 17 00:00:00 2001
From: Tiezhu Yang <yangtiezhu@loongson.cn>
Date: Fri, 5 Nov 2021 09:30:00 +0800
Subject: bpf: Change value of MAX_TAIL_CALL_CNT from 32 to 33
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In the current code, the actual max tail call count is 33 which is greater
than MAX_TAIL_CALL_CNT (defined as 32). The actual limit is not consistent
with the meaning of MAX_TAIL_CALL_CNT and thus confusing at first glance.
We can see the historical evolution from commit 04fd61ab36ec ("bpf: allow
bpf programs to tail-call other bpf programs") and commit f9dabe016b63
("bpf: Undo off-by-one in interpreter tail call count limit"). In order
to avoid changing existing behavior, the actual limit is 33 now, this is
reasonable.

After commit 874be05f525e ("bpf, tests: Add tail call test suite"), we can
see there exists failed testcase.

On all archs when CONFIG_BPF_JIT_ALWAYS_ON is not set:
 # echo 0 > /proc/sys/net/core/bpf_jit_enable
 # modprobe test_bpf
 # dmesg | grep -w FAIL
 Tail call error path, max count reached jited:0 ret 34 != 33 FAIL

On some archs:
 # echo 1 > /proc/sys/net/core/bpf_jit_enable
 # modprobe test_bpf
 # dmesg | grep -w FAIL
 Tail call error path, max count reached jited:1 ret 34 != 33 FAIL

Although the above failed testcase has been fixed in commit 18935a72eb25
("bpf/tests: Fix error in tail call limit tests"), it would still be good
to change the value of MAX_TAIL_CALL_CNT from 32 to 33 to make the code
more readable.

The 32-bit x86 JIT was using a limit of 32, just fix the wrong comments and
limit to 33 tail calls as the constant MAX_TAIL_CALL_CNT updated. For the
mips64 JIT, use "ori" instead of "addiu" as suggested by Johan Almbladh.
For the riscv JIT, use RV_REG_TCC directly to save one register move as
suggested by Björn Töpel. For the other implementations, no function changes,
it does not change the current limit 33, the new value of MAX_TAIL_CALL_CNT
can reflect the actual max tail call count, the related tail call testcases
in test_bpf module and selftests can work well for the interpreter and the
JIT.

Here are the test results on x86_64:

 # uname -m
 x86_64
 # echo 0 > /proc/sys/net/core/bpf_jit_enable
 # modprobe test_bpf test_suite=test_tail_calls
 # dmesg | tail -1
 test_bpf: test_tail_calls: Summary: 8 PASSED, 0 FAILED, [0/8 JIT'ed]
 # rmmod test_bpf
 # echo 1 > /proc/sys/net/core/bpf_jit_enable
 # modprobe test_bpf test_suite=test_tail_calls
 # dmesg | tail -1
 test_bpf: test_tail_calls: Summary: 8 PASSED, 0 FAILED, [8/8 JIT'ed]
 # rmmod test_bpf
 # ./test_progs -t tailcalls
 #142 tailcalls:OK
 Summary: 1/11 PASSED, 0 SKIPPED, 0 FAILED

Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: Johan Almbladh <johan.almbladh@anyfinetworks.com>
Tested-by: Ilya Leoshkevich <iii@linux.ibm.com>
Acked-by: Björn Töpel <bjorn@kernel.org>
Acked-by: Johan Almbladh <johan.almbladh@anyfinetworks.com>
Acked-by: Ilya Leoshkevich <iii@linux.ibm.com>
Link: https://lore.kernel.org/bpf/1636075800-3264-1-git-send-email-yangtiezhu@loongson.cn
---
 include/linux/bpf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 56098c866704..cc7a0c36e7df 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1081,7 +1081,7 @@ struct bpf_array {
 };
 
 #define BPF_COMPLEXITY_LIMIT_INSNS      1000000 /* yes. 1M insns */
-#define MAX_TAIL_CALL_CNT 32
+#define MAX_TAIL_CALL_CNT 33
 
 #define BPF_F_ACCESS_MASK	(BPF_F_RDONLY |		\
 				 BPF_F_RDONLY_PROG |	\
-- 
cgit v1.2.3


From 42f67eea3ba36cef2dce2e853de6ddcb2e89eb39 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 15 Nov 2021 11:02:33 -0800
Subject: net: use sk_is_tcp() in more places

Move sk_is_tcp() to include/net/sock.h and use it where we can.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skmsg.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 584d94be9c8b..18a717fe62eb 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -507,12 +507,6 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock)
 	return !!psock->saved_data_ready;
 }
 
-static inline bool sk_is_tcp(const struct sock *sk)
-{
-	return sk->sk_type == SOCK_STREAM &&
-	       sk->sk_protocol == IPPROTO_TCP;
-}
-
 static inline bool sk_is_udp(const struct sock *sk)
 {
 	return sk->sk_type == SOCK_DGRAM &&
-- 
cgit v1.2.3


From f35f821935d8df76f9c92e2431a225bdff938169 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 15 Nov 2021 11:02:46 -0800
Subject: tcp: defer skb freeing after socket lock is released

tcp recvmsg() (or rx zerocopy) spends a fair amount of time
freeing skbs after their payload has been consumed.

A typical ~64KB GRO packet has to release ~45 page
references, eventually going to page allocator
for each of them.

Currently, this freeing is performed while socket lock
is held, meaning that there is a high chance that
BH handler has to queue incoming packets to tcp socket backlog.

This can cause additional latencies, because the user
thread has to process the backlog at release_sock() time,
and while doing so, additional frames can be added
by BH handler.

This patch adds logic to defer these frees after socket
lock is released, or directly from BH handler if possible.

Being able to free these skbs from BH handler helps a lot,
because this avoids the usual alloc/free assymetry,
when BH handler and user thread do not run on same cpu or
NUMA node.

One cpu can now be fully utilized for the kernel->user copy,
and another cpu is handling BH processing and skb/page
allocs/frees (assuming RFS is not forcing use of a single CPU)

Tested:
 100Gbit NIC
 Max throughput for one TCP_STREAM flow, over 10 runs

MTU : 1500
Before: 55 Gbit
After:  66 Gbit

MTU : 4096+(headers)
Before: 82 Gbit
After:  95 Gbit

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 686a666d073d..b8b806512e16 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -36,6 +36,7 @@
 #include <linux/splice.h>
 #include <linux/in6.h>
 #include <linux/if_packet.h>
+#include <linux/llist.h>
 #include <net/flow.h>
 #include <net/page_pool.h>
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
@@ -743,6 +744,7 @@ struct sk_buff {
 		};
 		struct rb_node		rbnode; /* used in netem, ip4 defrag, and tcp stack */
 		struct list_head	list;
+		struct llist_node	ll_node;
 	};
 
 	union {
-- 
cgit v1.2.3


From 4721031c3559db8eae61df305f10c00099a7c1d0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 15 Nov 2021 09:05:51 -0800
Subject: net: move gro definitions to include/net/gro.h

include/linux/netdevice.h became too big, move gro stuff
into include/net/gro.h

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 348 ----------------------------------------------
 1 file changed, 348 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3ec42495a43a..d95c9839ce90 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2520,109 +2520,6 @@ static inline void netif_napi_del(struct napi_struct *napi)
 	synchronize_net();
 }
 
-struct napi_gro_cb {
-	/* Virtual address of skb_shinfo(skb)->frags[0].page + offset. */
-	void	*frag0;
-
-	/* Length of frag0. */
-	unsigned int frag0_len;
-
-	/* This indicates where we are processing relative to skb->data. */
-	int	data_offset;
-
-	/* This is non-zero if the packet cannot be merged with the new skb. */
-	u16	flush;
-
-	/* Save the IP ID here and check when we get to the transport layer */
-	u16	flush_id;
-
-	/* Number of segments aggregated. */
-	u16	count;
-
-	/* Start offset for remote checksum offload */
-	u16	gro_remcsum_start;
-
-	/* jiffies when first packet was created/queued */
-	unsigned long age;
-
-	/* Used in ipv6_gro_receive() and foo-over-udp */
-	u16	proto;
-
-	/* This is non-zero if the packet may be of the same flow. */
-	u8	same_flow:1;
-
-	/* Used in tunnel GRO receive */
-	u8	encap_mark:1;
-
-	/* GRO checksum is valid */
-	u8	csum_valid:1;
-
-	/* Number of checksums via CHECKSUM_UNNECESSARY */
-	u8	csum_cnt:3;
-
-	/* Free the skb? */
-	u8	free:2;
-#define NAPI_GRO_FREE		  1
-#define NAPI_GRO_FREE_STOLEN_HEAD 2
-
-	/* Used in foo-over-udp, set in udp[46]_gro_receive */
-	u8	is_ipv6:1;
-
-	/* Used in GRE, set in fou/gue_gro_receive */
-	u8	is_fou:1;
-
-	/* Used to determine if flush_id can be ignored */
-	u8	is_atomic:1;
-
-	/* Number of gro_receive callbacks this packet already went through */
-	u8 recursion_counter:4;
-
-	/* GRO is done by frag_list pointer chaining. */
-	u8	is_flist:1;
-
-	/* used to support CHECKSUM_COMPLETE for tunneling protocols */
-	__wsum	csum;
-
-	/* used in skb_gro_receive() slow path */
-	struct sk_buff *last;
-};
-
-#define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb)
-
-#define GRO_RECURSION_LIMIT 15
-static inline int gro_recursion_inc_test(struct sk_buff *skb)
-{
-	return ++NAPI_GRO_CB(skb)->recursion_counter == GRO_RECURSION_LIMIT;
-}
-
-typedef struct sk_buff *(*gro_receive_t)(struct list_head *, struct sk_buff *);
-static inline struct sk_buff *call_gro_receive(gro_receive_t cb,
-					       struct list_head *head,
-					       struct sk_buff *skb)
-{
-	if (unlikely(gro_recursion_inc_test(skb))) {
-		NAPI_GRO_CB(skb)->flush |= 1;
-		return NULL;
-	}
-
-	return cb(head, skb);
-}
-
-typedef struct sk_buff *(*gro_receive_sk_t)(struct sock *, struct list_head *,
-					    struct sk_buff *);
-static inline struct sk_buff *call_gro_receive_sk(gro_receive_sk_t cb,
-						  struct sock *sk,
-						  struct list_head *head,
-						  struct sk_buff *skb)
-{
-	if (unlikely(gro_recursion_inc_test(skb))) {
-		NAPI_GRO_CB(skb)->flush |= 1;
-		return NULL;
-	}
-
-	return cb(sk, head, skb);
-}
-
 struct packet_type {
 	__be16			type;	/* This is really htons(ether_type). */
 	bool			ignore_outgoing;
@@ -3008,251 +2905,6 @@ int dev_restart(struct net_device *dev);
 int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb);
 int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb);
 
-static inline unsigned int skb_gro_offset(const struct sk_buff *skb)
-{
-	return NAPI_GRO_CB(skb)->data_offset;
-}
-
-static inline unsigned int skb_gro_len(const struct sk_buff *skb)
-{
-	return skb->len - NAPI_GRO_CB(skb)->data_offset;
-}
-
-static inline void skb_gro_pull(struct sk_buff *skb, unsigned int len)
-{
-	NAPI_GRO_CB(skb)->data_offset += len;
-}
-
-static inline void *skb_gro_header_fast(struct sk_buff *skb,
-					unsigned int offset)
-{
-	return NAPI_GRO_CB(skb)->frag0 + offset;
-}
-
-static inline int skb_gro_header_hard(struct sk_buff *skb, unsigned int hlen)
-{
-	return NAPI_GRO_CB(skb)->frag0_len < hlen;
-}
-
-static inline void skb_gro_frag0_invalidate(struct sk_buff *skb)
-{
-	NAPI_GRO_CB(skb)->frag0 = NULL;
-	NAPI_GRO_CB(skb)->frag0_len = 0;
-}
-
-static inline void *skb_gro_header_slow(struct sk_buff *skb, unsigned int hlen,
-					unsigned int offset)
-{
-	if (!pskb_may_pull(skb, hlen))
-		return NULL;
-
-	skb_gro_frag0_invalidate(skb);
-	return skb->data + offset;
-}
-
-static inline void *skb_gro_network_header(struct sk_buff *skb)
-{
-	return (NAPI_GRO_CB(skb)->frag0 ?: skb->data) +
-	       skb_network_offset(skb);
-}
-
-static inline void skb_gro_postpull_rcsum(struct sk_buff *skb,
-					const void *start, unsigned int len)
-{
-	if (NAPI_GRO_CB(skb)->csum_valid)
-		NAPI_GRO_CB(skb)->csum = csum_sub(NAPI_GRO_CB(skb)->csum,
-						  csum_partial(start, len, 0));
-}
-
-/* GRO checksum functions. These are logical equivalents of the normal
- * checksum functions (in skbuff.h) except that they operate on the GRO
- * offsets and fields in sk_buff.
- */
-
-__sum16 __skb_gro_checksum_complete(struct sk_buff *skb);
-
-static inline bool skb_at_gro_remcsum_start(struct sk_buff *skb)
-{
-	return (NAPI_GRO_CB(skb)->gro_remcsum_start == skb_gro_offset(skb));
-}
-
-static inline bool __skb_gro_checksum_validate_needed(struct sk_buff *skb,
-						      bool zero_okay,
-						      __sum16 check)
-{
-	return ((skb->ip_summed != CHECKSUM_PARTIAL ||
-		skb_checksum_start_offset(skb) <
-		 skb_gro_offset(skb)) &&
-		!skb_at_gro_remcsum_start(skb) &&
-		NAPI_GRO_CB(skb)->csum_cnt == 0 &&
-		(!zero_okay || check));
-}
-
-static inline __sum16 __skb_gro_checksum_validate_complete(struct sk_buff *skb,
-							   __wsum psum)
-{
-	if (NAPI_GRO_CB(skb)->csum_valid &&
-	    !csum_fold(csum_add(psum, NAPI_GRO_CB(skb)->csum)))
-		return 0;
-
-	NAPI_GRO_CB(skb)->csum = psum;
-
-	return __skb_gro_checksum_complete(skb);
-}
-
-static inline void skb_gro_incr_csum_unnecessary(struct sk_buff *skb)
-{
-	if (NAPI_GRO_CB(skb)->csum_cnt > 0) {
-		/* Consume a checksum from CHECKSUM_UNNECESSARY */
-		NAPI_GRO_CB(skb)->csum_cnt--;
-	} else {
-		/* Update skb for CHECKSUM_UNNECESSARY and csum_level when we
-		 * verified a new top level checksum or an encapsulated one
-		 * during GRO. This saves work if we fallback to normal path.
-		 */
-		__skb_incr_checksum_unnecessary(skb);
-	}
-}
-
-#define __skb_gro_checksum_validate(skb, proto, zero_okay, check,	\
-				    compute_pseudo)			\
-({									\
-	__sum16 __ret = 0;						\
-	if (__skb_gro_checksum_validate_needed(skb, zero_okay, check))	\
-		__ret = __skb_gro_checksum_validate_complete(skb,	\
-				compute_pseudo(skb, proto));		\
-	if (!__ret)							\
-		skb_gro_incr_csum_unnecessary(skb);			\
-	__ret;								\
-})
-
-#define skb_gro_checksum_validate(skb, proto, compute_pseudo)		\
-	__skb_gro_checksum_validate(skb, proto, false, 0, compute_pseudo)
-
-#define skb_gro_checksum_validate_zero_check(skb, proto, check,		\
-					     compute_pseudo)		\
-	__skb_gro_checksum_validate(skb, proto, true, check, compute_pseudo)
-
-#define skb_gro_checksum_simple_validate(skb)				\
-	__skb_gro_checksum_validate(skb, 0, false, 0, null_compute_pseudo)
-
-static inline bool __skb_gro_checksum_convert_check(struct sk_buff *skb)
-{
-	return (NAPI_GRO_CB(skb)->csum_cnt == 0 &&
-		!NAPI_GRO_CB(skb)->csum_valid);
-}
-
-static inline void __skb_gro_checksum_convert(struct sk_buff *skb,
-					      __wsum pseudo)
-{
-	NAPI_GRO_CB(skb)->csum = ~pseudo;
-	NAPI_GRO_CB(skb)->csum_valid = 1;
-}
-
-#define skb_gro_checksum_try_convert(skb, proto, compute_pseudo)	\
-do {									\
-	if (__skb_gro_checksum_convert_check(skb))			\
-		__skb_gro_checksum_convert(skb, 			\
-					   compute_pseudo(skb, proto));	\
-} while (0)
-
-struct gro_remcsum {
-	int offset;
-	__wsum delta;
-};
-
-static inline void skb_gro_remcsum_init(struct gro_remcsum *grc)
-{
-	grc->offset = 0;
-	grc->delta = 0;
-}
-
-static inline void *skb_gro_remcsum_process(struct sk_buff *skb, void *ptr,
-					    unsigned int off, size_t hdrlen,
-					    int start, int offset,
-					    struct gro_remcsum *grc,
-					    bool nopartial)
-{
-	__wsum delta;
-	size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start);
-
-	BUG_ON(!NAPI_GRO_CB(skb)->csum_valid);
-
-	if (!nopartial) {
-		NAPI_GRO_CB(skb)->gro_remcsum_start = off + hdrlen + start;
-		return ptr;
-	}
-
-	ptr = skb_gro_header_fast(skb, off);
-	if (skb_gro_header_hard(skb, off + plen)) {
-		ptr = skb_gro_header_slow(skb, off + plen, off);
-		if (!ptr)
-			return NULL;
-	}
-
-	delta = remcsum_adjust(ptr + hdrlen, NAPI_GRO_CB(skb)->csum,
-			       start, offset);
-
-	/* Adjust skb->csum since we changed the packet */
-	NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta);
-
-	grc->offset = off + hdrlen + offset;
-	grc->delta = delta;
-
-	return ptr;
-}
-
-static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb,
-					   struct gro_remcsum *grc)
-{
-	void *ptr;
-	size_t plen = grc->offset + sizeof(u16);
-
-	if (!grc->delta)
-		return;
-
-	ptr = skb_gro_header_fast(skb, grc->offset);
-	if (skb_gro_header_hard(skb, grc->offset + sizeof(u16))) {
-		ptr = skb_gro_header_slow(skb, plen, grc->offset);
-		if (!ptr)
-			return;
-	}
-
-	remcsum_unadjust((__sum16 *)ptr, grc->delta);
-}
-
-#ifdef CONFIG_XFRM_OFFLOAD
-static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush)
-{
-	if (PTR_ERR(pp) != -EINPROGRESS)
-		NAPI_GRO_CB(skb)->flush |= flush;
-}
-static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb,
-					       struct sk_buff *pp,
-					       int flush,
-					       struct gro_remcsum *grc)
-{
-	if (PTR_ERR(pp) != -EINPROGRESS) {
-		NAPI_GRO_CB(skb)->flush |= flush;
-		skb_gro_remcsum_cleanup(skb, grc);
-		skb->remcsum_offload = 0;
-	}
-}
-#else
-static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush)
-{
-	NAPI_GRO_CB(skb)->flush |= flush;
-}
-static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb,
-					       struct sk_buff *pp,
-					       int flush,
-					       struct gro_remcsum *grc)
-{
-	NAPI_GRO_CB(skb)->flush |= flush;
-	skb_gro_remcsum_cleanup(skb, grc);
-	skb->remcsum_offload = 0;
-}
-#endif
 
 static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev,
 				  unsigned short type,
-- 
cgit v1.2.3


From 0b935d7f8c07bf0a192712bdbf76dbf45ef8b115 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 15 Nov 2021 09:05:52 -0800
Subject: net: gro: move skb_gro_receive_list to udp_offload.c

This helper is used once, no need to keep it in fat net/core/skbuff.c

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d95c9839ce90..ce6ee1453dbc 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2903,7 +2903,6 @@ struct net_device *dev_get_by_napi_id(unsigned int napi_id);
 int netdev_get_name(struct net *net, char *name, int ifindex);
 int dev_restart(struct net_device *dev);
 int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb);
-int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb);
 
 
 static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev,
-- 
cgit v1.2.3


From e456a18a390b96f22b0de2acd4d0f49c72ed2280 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 15 Nov 2021 09:05:53 -0800
Subject: net: gro: move skb_gro_receive into net/core/gro.c

net/core/gro.c will contain all core gro functions,
to shrink net/core/skbuff.c and net/core/dev.c

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ce6ee1453dbc..93d397db9ec4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2902,7 +2902,6 @@ struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
 struct net_device *dev_get_by_napi_id(unsigned int napi_id);
 int netdev_get_name(struct net *net, char *name, int ifindex);
 int dev_restart(struct net_device *dev);
-int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb);
 
 
 static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev,
-- 
cgit v1.2.3


From 587652bbdd06ab38a4c1b85e40f933d2cf4a1147 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 15 Nov 2021 09:05:54 -0800
Subject: net: gro: populate net/core/gro.c

Move gro code and data from net/core/dev.c to net/core/gro.c
to ease maintenance.

gro_normal_list() and gro_normal_one() are inlined
because they are called from both files.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 93d397db9ec4..31a7e6b27681 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3657,6 +3657,7 @@ int netif_rx_ni(struct sk_buff *skb);
 int netif_rx_any_context(struct sk_buff *skb);
 int netif_receive_skb(struct sk_buff *skb);
 int netif_receive_skb_core(struct sk_buff *skb);
+void netif_receive_skb_list_internal(struct list_head *head);
 void netif_receive_skb_list(struct list_head *head);
 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
 void napi_gro_flush(struct napi_struct *napi, bool flush_old);
-- 
cgit v1.2.3


From 77d641baa3c8e18a1056bec6c64c6103c1a17b1e Mon Sep 17 00:00:00 2001
From: Luca Ceresoli <luca@lucaceresoli.net>
Date: Mon, 8 Nov 2021 17:27:05 +0100
Subject: power: supply: core: add POWER_SUPPLY_HEALTH_NO_BATTERY

Some chargers can keep the system powered from the mains even when no
battery is present. It this case none of the currently defined health
statuses applies. Add a new status to report that no battery is present.

Suggested-by: Sebastian Reichel <sebastian.reichel@collabora.com>
Signed-off-by: Luca Ceresoli <luca@lucaceresoli.net>
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 include/linux/power_supply.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index 9ca1f120a211..2d1318fe2455 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -66,6 +66,7 @@ enum {
 	POWER_SUPPLY_HEALTH_WARM,
 	POWER_SUPPLY_HEALTH_COOL,
 	POWER_SUPPLY_HEALTH_HOT,
+	POWER_SUPPLY_HEALTH_NO_BATTERY,
 };
 
 enum {
-- 
cgit v1.2.3


From d7751d6476185ff754b9dad2cba0c0a6e43ecadc Mon Sep 17 00:00:00 2001
From: Paul Blakey <paulb@nvidia.com>
Date: Thu, 20 May 2021 17:09:58 +0300
Subject: net/mlx5: E-Switch, Fix resetting of encap mode when entering
 switchdev

E-Switch encap mode is relevant only when in switchdev mode.
The RDMA driver can query the encap configuration via
mlx5_eswitch_get_encap_mode(). Make sure it returns the currently
used mode and not the set one.

This reverts the cited commit which reset the encap mode
on entering switchdev and fixes the original issue properly.

Fixes: 9a64144d683a ("net/mlx5: E-Switch, Fix default encap mode")
Signed-off-by: Paul Blakey <paulb@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Maor Dickman <maord@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/eswitch.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h
index 97afcea39a7b..8b18fe9771f9 100644
--- a/include/linux/mlx5/eswitch.h
+++ b/include/linux/mlx5/eswitch.h
@@ -145,13 +145,13 @@ u32 mlx5_eswitch_get_vport_metadata_for_set(struct mlx5_eswitch *esw,
 	GENMASK(31 - ESW_TUN_ID_BITS - ESW_RESERVED_BITS, \
 		ESW_TUN_OPTS_OFFSET + 1)
 
-u8 mlx5_eswitch_mode(struct mlx5_core_dev *dev);
+u8 mlx5_eswitch_mode(const struct mlx5_core_dev *dev);
 u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev);
 struct mlx5_core_dev *mlx5_eswitch_get_core_dev(struct mlx5_eswitch *esw);
 
 #else  /* CONFIG_MLX5_ESWITCH */
 
-static inline u8 mlx5_eswitch_mode(struct mlx5_core_dev *dev)
+static inline u8 mlx5_eswitch_mode(const struct mlx5_core_dev *dev)
 {
 	return MLX5_ESWITCH_NONE;
 }
-- 
cgit v1.2.3


From c2c60ea37e5b6be58c9dd7aff0b2e86ba0f18e0b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 15 Nov 2021 09:23:01 -0800
Subject: once: use __section(".data.once")

.data.once contains nicely packed bool variables.
It is used already by DO_ONCE_LITE().

Using it also in DO_ONCE() removes holes in .data section.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/once.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/once.h b/include/linux/once.h
index d361fb14ac3a..f54523052bbc 100644
--- a/include/linux/once.h
+++ b/include/linux/once.h
@@ -38,7 +38,7 @@ void __do_once_done(bool *done, struct static_key_true *once_key,
 #define DO_ONCE(func, ...)						     \
 	({								     \
 		bool ___ret = false;					     \
-		static bool ___done = false;				     \
+		static bool __section(".data.once") ___done = false;	     \
 		static DEFINE_STATIC_KEY_TRUE(___once_key);		     \
 		if (static_branch_unlikely(&___once_key)) {		     \
 			unsigned long ___flags;				     \
-- 
cgit v1.2.3


From 7071732c26fe2cf141185ed16a8a85d02495ae8c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 15 Nov 2021 09:23:02 -0800
Subject: net: use .data.once section in netdev_level_once()

Same rationale than prior patch : using the dedicated
section avoid holes and pack all these bool values.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 31a7e6b27681..dd328364dfe9 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4942,7 +4942,7 @@ void netdev_info(const struct net_device *dev, const char *format, ...);
 
 #define netdev_level_once(level, dev, fmt, ...)			\
 do {								\
-	static bool __print_once __read_mostly;			\
+	static bool __section(".data.once") __print_once;	\
 								\
 	if (!__print_once) {					\
 		__print_once = true;				\
-- 
cgit v1.2.3


From 49ecc2e9c3abd269951972fa8b23a4d081111b80 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 15 Nov 2021 09:23:03 -0800
Subject: net: align static siphash keys

siphash keys use 16 bytes.

Define siphash_aligned_key_t macro so that we can make sure they
are not crossing a cache line boundary.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/siphash.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/siphash.h b/include/linux/siphash.h
index bf21591a9e5e..3f7427b9e935 100644
--- a/include/linux/siphash.h
+++ b/include/linux/siphash.h
@@ -21,6 +21,8 @@ typedef struct {
 	u64 key[2];
 } siphash_key_t;
 
+#define siphash_aligned_key_t siphash_key_t __aligned(16)
+
 static inline bool siphash_key_is_zero(const siphash_key_t *key)
 {
 	return !(key->key[0] | key->key[1]);
-- 
cgit v1.2.3


From b9241f54138ca5af4d3c5ca6db56be83d7491508 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Mon, 15 Nov 2021 17:11:17 +0000
Subject: net: document SMII and correct phylink's new validation mechanism

SMII has not been documented in the kernel, but information on this PHY
interface mode has been recently found. Document it, and correct the
recently introduced phylink handling for this interface mode.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://lore.kernel.org/r/E1mmfVl-0075nP-14@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/phy.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 96e43fbb2dd8..1e57cdd95da3 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -99,7 +99,7 @@ extern const int phy_10gbit_features_array[1];
  * @PHY_INTERFACE_MODE_RGMII_RXID: RGMII with Internal RX delay
  * @PHY_INTERFACE_MODE_RGMII_TXID: RGMII with Internal RX delay
  * @PHY_INTERFACE_MODE_RTBI: Reduced TBI
- * @PHY_INTERFACE_MODE_SMII: ??? MII
+ * @PHY_INTERFACE_MODE_SMII: Serial MII
  * @PHY_INTERFACE_MODE_XGMII: 10 gigabit media-independent interface
  * @PHY_INTERFACE_MODE_XLGMII:40 gigabit media-independent interface
  * @PHY_INTERFACE_MODE_MOCA: Multimedia over Coax
-- 
cgit v1.2.3


From 5854d4a6cc356ba3e16d8593ac1c089a32d1759c Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus@microchip.com>
Date: Fri, 29 Oct 2021 20:26:12 +0300
Subject: mtd: spi-nor: Get rid of nor->page_size

nor->page_size duplicated what nor->params->page_size indicates
for no good reason. page_size is a flash parameter of fixed value
and it is better suited to be found in nor->params->page_size.

Signed-off-by: Tudor Ambarus <tudor.ambarus@microchip.com>
Reviewed-by: Pratyush Yadav <p.yadav@ti.com>
Reviewed-by: Michael Walle <michael@walle.cc>
Link: https://lore.kernel.org/r/20211029172633.886453-5-tudor.ambarus@microchip.com
---
 include/linux/mtd/spi-nor.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h
index f67457748ed8..fc90fce26e33 100644
--- a/include/linux/mtd/spi-nor.h
+++ b/include/linux/mtd/spi-nor.h
@@ -371,7 +371,6 @@ struct spi_nor_flash_parameter;
  * @bouncebuf_size:	size of the bounce buffer
  * @info:		SPI NOR part JEDEC MFR ID and other info
  * @manufacturer:	SPI NOR manufacturer
- * @page_size:		the page size of the SPI NOR
  * @addr_width:		number of address bytes
  * @erase_opcode:	the opcode for erasing a sector
  * @read_opcode:	the read opcode
@@ -401,7 +400,6 @@ struct spi_nor {
 	size_t			bouncebuf_size;
 	const struct flash_info	*info;
 	const struct spi_nor_manufacturer *manufacturer;
-	u32			page_size;
 	u8			addr_width;
 	u8			erase_opcode;
 	u8			read_opcode;
-- 
cgit v1.2.3


From 1e2c3ef0496e72ba9001da5fd1b7ed56ccb30597 Mon Sep 17 00:00:00 2001
From: Jens Wiklander <jens.wiklander@linaro.org>
Date: Mon, 4 Oct 2021 16:11:52 +0200
Subject: tee: export teedev_open() and teedev_close_context()

Exports the two functions teedev_open() and teedev_close_context() in
order to make it easier to create a driver internal struct tee_context.

Reviewed-by: Sumit Garg <sumit.garg@linaro.org>
Signed-off-by: Jens Wiklander <jens.wiklander@linaro.org>
---
 include/linux/tee_drv.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h
index a1f03461369b..468a7d83dc6c 100644
--- a/include/linux/tee_drv.h
+++ b/include/linux/tee_drv.h
@@ -587,4 +587,18 @@ struct tee_client_driver {
 #define to_tee_client_driver(d) \
 		container_of(d, struct tee_client_driver, driver)
 
+/**
+ * teedev_open() - Open a struct tee_device
+ * @teedev:	Device to open
+ *
+ * @return a pointer to struct tee_context on success or an ERR_PTR on failure.
+ */
+struct tee_context *teedev_open(struct tee_device *teedev);
+
+/**
+ * teedev_close_context() - closes a struct tee_context
+ * @ctx:	The struct tee_context to close
+ */
+void teedev_close_context(struct tee_context *ctx);
+
 #endif /*__TEE_DRV_H*/
-- 
cgit v1.2.3


From 49c39ec4670a8f045729e3717af2e1a74caf89a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
Date: Mon, 13 Sep 2021 14:21:25 +0200
Subject: dma-buf: nuke dma_resv_get_excl_unlocked
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Heureka, that's finally not used any more.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20210917123513.1106-27-christian.koenig@amd.com
---
 include/linux/dma-resv.h | 26 --------------------------
 1 file changed, 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-resv.h b/include/linux/dma-resv.h
index 09c6063b199a..eebf04325b34 100644
--- a/include/linux/dma-resv.h
+++ b/include/linux/dma-resv.h
@@ -440,32 +440,6 @@ dma_resv_excl_fence(struct dma_resv *obj)
 	return rcu_dereference_check(obj->fence_excl, dma_resv_held(obj));
 }
 
-/**
- * dma_resv_get_excl_unlocked - get the reservation object's
- * exclusive fence, without lock held.
- * @obj: the reservation object
- *
- * If there is an exclusive fence, this atomically increments it's
- * reference count and returns it.
- *
- * RETURNS
- * The exclusive fence or NULL if none
- */
-static inline struct dma_fence *
-dma_resv_get_excl_unlocked(struct dma_resv *obj)
-{
-	struct dma_fence *fence;
-
-	if (!rcu_access_pointer(obj->fence_excl))
-		return NULL;
-
-	rcu_read_lock();
-	fence = dma_fence_get_rcu_safe(&obj->fence_excl);
-	rcu_read_unlock();
-
-	return fence;
-}
-
 /**
  * dma_resv_shared_list - get the reservation object's shared fence list
  * @obj: the reservation object
-- 
cgit v1.2.3


From 2fb75e1b642f49253d8848c9e47e8942f5366221 Mon Sep 17 00:00:00 2001
From: Liu Xinpeng <liuxp11@chinatelecom.cn>
Date: Mon, 25 Oct 2021 11:46:26 +0800
Subject: psi: Add a missing SPDX license header

Add the missing SPDX license header to
include/linux/psi.h
include/linux/psi_types.h
kernel/sched/psi.c

Signed-off-by: Liu Xinpeng <liuxp11@chinatelecom.cn>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Link: https://lore.kernel.org/r/1635133586-84611-2-git-send-email-liuxp11@chinatelecom.cn
---
 include/linux/psi.h       | 1 +
 include/linux/psi_types.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/psi.h b/include/linux/psi.h
index 65eb1476ac70..a70ca833c6d7 100644
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _LINUX_PSI_H
 #define _LINUX_PSI_H
 
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index 0a23300d49af..bf50068d5d4b 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _LINUX_PSI_TYPES_H
 #define _LINUX_PSI_TYPES_H
 
-- 
cgit v1.2.3


From 4feee7d12603deca8775f9f9ae5e121093837444 Mon Sep 17 00:00:00 2001
From: Josh Don <joshdon@google.com>
Date: Mon, 18 Oct 2021 13:34:28 -0700
Subject: sched/core: Forced idle accounting

Adds accounting for "forced idle" time, which is time where a cookie'd
task forces its SMT sibling to idle, despite the presence of runnable
tasks.

Forced idle time is one means to measure the cost of enabling core
scheduling (ie. the capacity lost due to the need to force idle).

Forced idle time is attributed to the thread responsible for causing
the forced idle.

A few details:
 - Forced idle time is displayed via /proc/PID/sched. It also requires
   that schedstats is enabled.
 - Forced idle is only accounted when a sibling hyperthread is held
   idle despite the presence of runnable tasks. No time is charged if
   a sibling is idle but has no runnable tasks.
 - Tasks with 0 cookie are never charged forced idle.
 - For SMT > 2, we scale the amount of forced idle charged based on the
   number of forced idle siblings. Additionally, we split the time up and
   evenly charge it to all running tasks, as each is equally responsible
   for the forced idle.

Signed-off-by: Josh Don <joshdon@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20211018203428.2025792-1-joshdon@google.com
---
 include/linux/sched.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 78c351e35fec..d2e261adb8ea 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -523,7 +523,11 @@ struct sched_statistics {
 	u64				nr_wakeups_affine_attempts;
 	u64				nr_wakeups_passive;
 	u64				nr_wakeups_idle;
+
+#ifdef CONFIG_SCHED_CORE
+	u64				core_forceidle_sum;
 #endif
+#endif /* CONFIG_SCHEDSTATS */
 } ____cacheline_aligned;
 
 struct sched_entity {
-- 
cgit v1.2.3


From cb0e52b7748737b2cf6481fdd9b920ce7e1ebbdf Mon Sep 17 00:00:00 2001
From: Brian Chen <brianchen118@gmail.com>
Date: Wed, 10 Nov 2021 21:33:12 +0000
Subject: psi: Fix PSI_MEM_FULL state when tasks are in memstall and doing
 reclaim

We've noticed cases where tasks in a cgroup are stalled on memory but
there is little memory FULL pressure since tasks stay on the runqueue
in reclaim.

A simple example involves a single threaded program that keeps leaking
and touching large amounts of memory. It runs in a cgroup with swap
enabled, memory.high set at 10M and cpu.max ratio set at 5%. Though
there is significant CPU pressure and memory SOME, there is barely any
memory FULL since the task enters reclaim and stays on the runqueue.
However, this memory-bound task is effectively stalled on memory and
we expect memory FULL to match memory SOME in this scenario.

The code is confused about memstall && running, thinking there is a
stalled task and a productive task when there's only one task: a
reclaimer that's counted as both. To fix this, we redefine the
condition for PSI_MEM_FULL to check that all running tasks are in an
active memstall instead of checking that there are no running tasks.

        case PSI_MEM_FULL:
-               return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]);
+               return unlikely(tasks[NR_MEMSTALL] &&
+                       tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);

This will capture reclaimers. It will also capture tasks that called
psi_memstall_enter() and are about to sleep, but this should be
negligible noise.

Signed-off-by: Brian Chen <brianchen118@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Link: https://lore.kernel.org/r/20211110213312.310243-1-brianchen118@gmail.com
---
 include/linux/psi_types.h | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index bf50068d5d4b..516c0fe836fd 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -22,7 +22,17 @@ enum psi_task_count {
 	 * don't have to special case any state tracking for it.
 	 */
 	NR_ONCPU,
-	NR_PSI_TASK_COUNTS = 4,
+	/*
+	 * For IO and CPU stalls the presence of running/oncpu tasks
+	 * in the domain means a partial rather than a full stall.
+	 * For memory it's not so simple because of page reclaimers:
+	 * they are running/oncpu while representing a stall. To tell
+	 * whether a domain has productivity left or not, we need to
+	 * distinguish between regular running (i.e. productive)
+	 * threads and memstall ones.
+	 */
+	NR_MEMSTALL_RUNNING,
+	NR_PSI_TASK_COUNTS = 5,
 };
 
 /* Task state bitmasks */
@@ -30,6 +40,7 @@ enum psi_task_count {
 #define TSK_MEMSTALL	(1 << NR_MEMSTALL)
 #define TSK_RUNNING	(1 << NR_RUNNING)
 #define TSK_ONCPU	(1 << NR_ONCPU)
+#define TSK_MEMSTALL_RUNNING	(1 << NR_MEMSTALL_RUNNING)
 
 /* Resources that workloads could be stalled on */
 enum psi_res {
-- 
cgit v1.2.3


From ff083a2d972f56bebfd82409ca62e5dfce950961 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 11 Nov 2021 02:07:22 +0000
Subject: perf: Protect perf_guest_cbs with RCU

Protect perf_guest_cbs with RCU to fix multiple possible errors.  Luckily,
all paths that read perf_guest_cbs already require RCU protection, e.g. to
protect the callback chains, so only the direct perf_guest_cbs touchpoints
need to be modified.

Bug #1 is a simple lack of WRITE_ONCE/READ_ONCE behavior to ensure
perf_guest_cbs isn't reloaded between a !NULL check and a dereference.
Fixed via the READ_ONCE() in rcu_dereference().

Bug #2 is that on weakly-ordered architectures, updates to the callbacks
themselves are not guaranteed to be visible before the pointer is made
visible to readers.  Fixed by the smp_store_release() in
rcu_assign_pointer() when the new pointer is non-NULL.

Bug #3 is that, because the callbacks are global, it's possible for
readers to run in parallel with an unregisters, and thus a module
implementing the callbacks can be unloaded while readers are in flight,
resulting in a use-after-free.  Fixed by a synchronize_rcu() call when
unregistering callbacks.

Bug #1 escaped notice because it's extremely unlikely a compiler will
reload perf_guest_cbs in this sequence.  perf_guest_cbs does get reloaded
for future derefs, e.g. for ->is_user_mode(), but the ->is_in_guest()
guard all but guarantees the consumer will win the race, e.g. to nullify
perf_guest_cbs, KVM has to completely exit the guest and teardown down
all VMs before KVM start its module unload / unregister sequence.  This
also makes it all but impossible to encounter bug #3.

Bug #2 has not been a problem because all architectures that register
callbacks are strongly ordered and/or have a static set of callbacks.

But with help, unloading kvm_intel can trigger bug #1 e.g. wrapping
perf_guest_cbs with READ_ONCE in perf_misc_flags() while spamming
kvm_intel module load/unload leads to:

  BUG: kernel NULL pointer dereference, address: 0000000000000000
  #PF: supervisor read access in kernel mode
  #PF: error_code(0x0000) - not-present page
  PGD 0 P4D 0
  Oops: 0000 [#1] PREEMPT SMP
  CPU: 6 PID: 1825 Comm: stress Not tainted 5.14.0-rc2+ #459
  Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
  RIP: 0010:perf_misc_flags+0x1c/0x70
  Call Trace:
   perf_prepare_sample+0x53/0x6b0
   perf_event_output_forward+0x67/0x160
   __perf_event_overflow+0x52/0xf0
   handle_pmi_common+0x207/0x300
   intel_pmu_handle_irq+0xcf/0x410
   perf_event_nmi_handler+0x28/0x50
   nmi_handle+0xc7/0x260
   default_do_nmi+0x6b/0x170
   exc_nmi+0x103/0x130
   asm_exc_nmi+0x76/0xbf

Fixes: 39447b386c84 ("perf: Enhance perf to allow for guest statistic collection from host")
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20211111020738.2512932-2-seanjc@google.com
---
 include/linux/perf_event.h | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 0dcfd265beed..318c489b735b 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1240,7 +1240,18 @@ extern void perf_event_bpf_event(struct bpf_prog *prog,
 				 enum perf_bpf_event_type type,
 				 u16 flags);
 
-extern struct perf_guest_info_callbacks *perf_guest_cbs;
+extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
+static inline struct perf_guest_info_callbacks *perf_get_guest_cbs(void)
+{
+	/*
+	 * Callbacks are RCU-protected and must be READ_ONCE to avoid reloading
+	 * the callbacks between a !NULL check and dereferences, to ensure
+	 * pending stores/changes to the callback pointers are visible before a
+	 * non-NULL perf_guest_cbs is visible to readers, and to prevent a
+	 * module from unloading callbacks while readers are active.
+	 */
+	return rcu_dereference(perf_guest_cbs);
+}
 extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
 extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
 
-- 
cgit v1.2.3


From 2934e3d09350c1a7ca2433fbeabfcd831e48a575 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 11 Nov 2021 02:07:25 +0000
Subject: perf: Stop pretending that perf can handle multiple guest callbacks

Drop the 'int' return value from the perf (un)register callbacks helpers
and stop pretending perf can support multiple callbacks.  The 'int'
returns are not future proofing anything as none of the callers take
action on an error.  It's also not obvious that there will ever be
co-tenant hypervisors, and if there are, that allowing multiple callbacks
to be registered is desirable or even correct.

Opportunistically rename callbacks=>cbs in the affected declarations to
match their definitions.

No functional change intended.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Link: https://lore.kernel.org/r/20211111020738.2512932-5-seanjc@google.com
---
 include/linux/perf_event.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 318c489b735b..98c204488496 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1252,8 +1252,8 @@ static inline struct perf_guest_info_callbacks *perf_get_guest_cbs(void)
 	 */
 	return rcu_dereference(perf_guest_cbs);
 }
-extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
-extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
+extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs);
+extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs);
 
 extern void perf_event_exec(void);
 extern void perf_event_comm(struct task_struct *tsk, bool exec);
@@ -1497,10 +1497,10 @@ perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)	{ }
 static inline void
 perf_bp_event(struct perf_event *event, void *data)			{ }
 
-static inline int perf_register_guest_info_callbacks
-(struct perf_guest_info_callbacks *callbacks)				{ return 0; }
-static inline int perf_unregister_guest_info_callbacks
-(struct perf_guest_info_callbacks *callbacks)				{ return 0; }
+static inline void perf_register_guest_info_callbacks
+(struct perf_guest_info_callbacks *cbs)					{ }
+static inline void perf_unregister_guest_info_callbacks
+(struct perf_guest_info_callbacks *cbs)					{ }
 
 static inline void perf_event_mmap(struct vm_area_struct *vma)		{ }
 
-- 
cgit v1.2.3


From b9f5621c9547dd787900f005a9e1c3d5712de512 Mon Sep 17 00:00:00 2001
From: Like Xu <like.xu@linux.intel.com>
Date: Thu, 11 Nov 2021 02:07:27 +0000
Subject: perf/core: Rework guest callbacks to prepare for static_call support

To prepare for using static_calls to optimize perf's guest callbacks,
replace ->is_in_guest and ->is_user_mode with a new multiplexed hook
->state, tweak ->handle_intel_pt_intr to play nice with being called when
there is no active guest, and drop "guest" from ->get_guest_ip.

Return '0' from ->state and ->handle_intel_pt_intr to indicate "not in
guest" so that DEFINE_STATIC_CALL_RET0 can be used to define the static
calls, i.e. no callback == !guest.

[sean: extracted from static_call patch, fixed get_ip() bug, wrote changelog]
Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Originally-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Like Xu <like.xu@linux.intel.com>
Signed-off-by: Zhu Lingshan <lingshan.zhu@intel.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Link: https://lore.kernel.org/r/20211111020738.2512932-7-seanjc@google.com
---
 include/linux/perf_event.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 98c204488496..5e6b346d62a7 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -26,11 +26,13 @@
 # include <asm/local64.h>
 #endif
 
+#define PERF_GUEST_ACTIVE	0x01
+#define PERF_GUEST_USER	0x02
+
 struct perf_guest_info_callbacks {
-	int				(*is_in_guest)(void);
-	int				(*is_user_mode)(void);
-	unsigned long			(*get_guest_ip)(void);
-	void				(*handle_intel_pt_intr)(void);
+	unsigned int			(*state)(void);
+	unsigned long			(*get_ip)(void);
+	unsigned int			(*handle_intel_pt_intr)(void);
 };
 
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
-- 
cgit v1.2.3


From 1c3430516b0732d923de9fd3bfb3e2e537eeb235 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 11 Nov 2021 02:07:28 +0000
Subject: perf: Add wrappers for invoking guest callbacks

Add helpers for the guest callbacks to prepare for burying the callbacks
behind a Kconfig (it's a lot easier to provide a few stubs than to #ifdef
piles of code), and also to prepare for converting the callbacks to
static_call().  perf_instruction_pointer() in particular will have subtle
semantics with static_call(), as the "no callbacks" case will return 0 if
the callbacks are unregistered between querying guest state and getting
the IP.  Implement the change now to avoid a functional change when adding
static_call() support, and because the new helper needs to return
_something_ in this case.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Link: https://lore.kernel.org/r/20211111020738.2512932-8-seanjc@google.com
---
 include/linux/perf_event.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 5e6b346d62a7..346d5aff5804 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1254,6 +1254,30 @@ static inline struct perf_guest_info_callbacks *perf_get_guest_cbs(void)
 	 */
 	return rcu_dereference(perf_guest_cbs);
 }
+static inline unsigned int perf_guest_state(void)
+{
+	struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs();
+
+	return guest_cbs ? guest_cbs->state() : 0;
+}
+static inline unsigned long perf_guest_get_ip(void)
+{
+	struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs();
+
+	/*
+	 * Arbitrarily return '0' in the unlikely scenario that the callbacks
+	 * are unregistered between checking guest state and getting the IP.
+	 */
+	return guest_cbs ? guest_cbs->get_ip() : 0;
+}
+static inline unsigned int perf_guest_handle_intel_pt_intr(void)
+{
+	struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs();
+
+	if (guest_cbs && guest_cbs->handle_intel_pt_intr)
+		return guest_cbs->handle_intel_pt_intr();
+	return 0;
+}
 extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs);
 extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs);
 
-- 
cgit v1.2.3


From 2aef6f306b39bbe74e2287d6e2ee07c4867d87d0 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 11 Nov 2021 02:07:29 +0000
Subject: perf: Force architectures to opt-in to guest callbacks

Introduce GUEST_PERF_EVENTS and require architectures to select it to
allow registering and using guest callbacks in perf.  This will hopefully
make it more difficult for new architectures to add useless "support" for
guest callbacks, e.g. via copy+paste.

Stubbing out the helpers has the happy bonus of avoiding a load of
perf_guest_cbs when GUEST_PERF_EVENTS=n on arm64/x86.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Link: https://lore.kernel.org/r/20211111020738.2512932-9-seanjc@google.com
---
 include/linux/perf_event.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 346d5aff5804..ea47ef616ee0 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1242,6 +1242,7 @@ extern void perf_event_bpf_event(struct bpf_prog *prog,
 				 enum perf_bpf_event_type type,
 				 u16 flags);
 
+#ifdef CONFIG_GUEST_PERF_EVENTS
 extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
 static inline struct perf_guest_info_callbacks *perf_get_guest_cbs(void)
 {
@@ -1280,6 +1281,11 @@ static inline unsigned int perf_guest_handle_intel_pt_intr(void)
 }
 extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs);
 extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs);
+#else
+static inline unsigned int perf_guest_state(void)		 { return 0; }
+static inline unsigned long perf_guest_get_ip(void)		 { return 0; }
+static inline unsigned int perf_guest_handle_intel_pt_intr(void) { return 0; }
+#endif /* CONFIG_GUEST_PERF_EVENTS */
 
 extern void perf_event_exec(void);
 extern void perf_event_comm(struct task_struct *tsk, bool exec);
-- 
cgit v1.2.3


From 87b940a0675e25261f022ac3e53e0dfff9cdb995 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 11 Nov 2021 02:07:30 +0000
Subject: perf/core: Use static_call to optimize perf_guest_info_callbacks

Use static_call to optimize perf's guest callbacks on arm64 and x86,
which are now the only architectures that define the callbacks.  Use
DEFINE_STATIC_CALL_RET0 as the default/NULL for all guest callbacks, as
the callback semantics are that a return value '0' means "not in guest".

static_call obviously avoids the overhead of CONFIG_RETPOLINE=y, but is
also advantageous versus other solutions, e.g. per-cpu callbacks, in that
a per-cpu memory load is not needed to detect the !guest case.

Based on code from Peter and Like.

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Link: https://lore.kernel.org/r/20211111020738.2512932-10-seanjc@google.com
---
 include/linux/perf_event.h | 34 ++++++++--------------------------
 1 file changed, 8 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index ea47ef616ee0..0ac7d867ca0c 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1244,40 +1244,22 @@ extern void perf_event_bpf_event(struct bpf_prog *prog,
 
 #ifdef CONFIG_GUEST_PERF_EVENTS
 extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
-static inline struct perf_guest_info_callbacks *perf_get_guest_cbs(void)
-{
-	/*
-	 * Callbacks are RCU-protected and must be READ_ONCE to avoid reloading
-	 * the callbacks between a !NULL check and dereferences, to ensure
-	 * pending stores/changes to the callback pointers are visible before a
-	 * non-NULL perf_guest_cbs is visible to readers, and to prevent a
-	 * module from unloading callbacks while readers are active.
-	 */
-	return rcu_dereference(perf_guest_cbs);
-}
+
+DECLARE_STATIC_CALL(__perf_guest_state, *perf_guest_cbs->state);
+DECLARE_STATIC_CALL(__perf_guest_get_ip, *perf_guest_cbs->get_ip);
+DECLARE_STATIC_CALL(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr);
+
 static inline unsigned int perf_guest_state(void)
 {
-	struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs();
-
-	return guest_cbs ? guest_cbs->state() : 0;
+	return static_call(__perf_guest_state)();
 }
 static inline unsigned long perf_guest_get_ip(void)
 {
-	struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs();
-
-	/*
-	 * Arbitrarily return '0' in the unlikely scenario that the callbacks
-	 * are unregistered between checking guest state and getting the IP.
-	 */
-	return guest_cbs ? guest_cbs->get_ip() : 0;
+	return static_call(__perf_guest_get_ip)();
 }
 static inline unsigned int perf_guest_handle_intel_pt_intr(void)
 {
-	struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs();
-
-	if (guest_cbs && guest_cbs->handle_intel_pt_intr)
-		return guest_cbs->handle_intel_pt_intr();
-	return 0;
+	return static_call(__perf_guest_handle_intel_pt_intr)();
 }
 extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs);
 extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs);
-- 
cgit v1.2.3


From e1bfc24577cc65c95dc519d7621a9c985b97e567 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 11 Nov 2021 02:07:33 +0000
Subject: KVM: Move x86's perf guest info callbacks to generic KVM

Move x86's perf guest callbacks into common KVM, as they are semantically
identical to arm64's callbacks (the only other such KVM callbacks).
arm64 will convert to the common versions in a future patch.

Implement the necessary arm64 arch hooks now to avoid having to provide
stubs or a temporary #define (from x86) to avoid arm64 compilation errors
when CONFIG_GUEST_PERF_EVENTS=y.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Acked-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20211111020738.2512932-13-seanjc@google.com
---
 include/linux/kvm_host.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 9e0667e3723e..9df7ab2d7530 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1170,6 +1170,16 @@ static inline bool kvm_arch_intc_initialized(struct kvm *kvm)
 }
 #endif
 
+#ifdef CONFIG_GUEST_PERF_EVENTS
+unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu);
+
+void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void));
+void kvm_unregister_perf_callbacks(void);
+#else
+static inline void kvm_register_perf_callbacks(void *ign) {}
+static inline void kvm_unregister_perf_callbacks(void) {}
+#endif /* CONFIG_GUEST_PERF_EVENTS */
+
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type);
 void kvm_arch_destroy_vm(struct kvm *kvm);
 void kvm_arch_sync_events(struct kvm *kvm);
-- 
cgit v1.2.3


From a9f4a6e92b3b319296fb078da2615f618f6cd80c Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 11 Nov 2021 02:07:38 +0000
Subject: perf: Drop guest callback (un)register stubs

Drop perf's stubs for (un)registering guest callbacks now that KVM
registration of callbacks is hidden behind GUEST_PERF_EVENTS=y.  The only
other user is x86 XEN_PV, and x86 unconditionally selects PERF_EVENTS.

No functional change intended.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Link: https://lore.kernel.org/r/20211111020738.2512932-18-seanjc@google.com
---
 include/linux/perf_event.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 0ac7d867ca0c..7b7525e9155f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1511,11 +1511,6 @@ perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)	{ }
 static inline void
 perf_bp_event(struct perf_event *event, void *data)			{ }
 
-static inline void perf_register_guest_info_callbacks
-(struct perf_guest_info_callbacks *cbs)					{ }
-static inline void perf_unregister_guest_info_callbacks
-(struct perf_guest_info_callbacks *cbs)					{ }
-
 static inline void perf_event_mmap(struct vm_area_struct *vma)		{ }
 
 typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data);
-- 
cgit v1.2.3


From f45b2974cc0ae959a4c503a071e38a56bd64372f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= <bjorn@kernel.org>
Date: Wed, 17 Nov 2021 13:57:08 +0100
Subject: bpf, x86: Fix "no previous prototype" warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The arch_prepare_bpf_dispatcher function does not have a prototype, and
yields the following warning when W=1 is enabled for the kernel build.

  >> arch/x86/net/bpf_jit_comp.c:2188:5: warning: no previous \
  prototype for 'arch_prepare_bpf_dispatcher' [-Wmissing-prototypes]
        2188 | int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, \
	int num_funcs)
             |     ^~~~~~~~~~~~~~~~~~~~~~~~~~~

Remove the warning by adding a function declaration to include/linux/bpf.h.

Fixes: 75ccbef6369e ("bpf: Introduce BPF dispatcher")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Björn Töpel <bjorn@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211117125708.769168-1-bjorn@kernel.org
---
 include/linux/bpf.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e7a163a3146b..84ff6ef49462 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -732,6 +732,7 @@ int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr)
 struct bpf_trampoline *bpf_trampoline_get(u64 key,
 					  struct bpf_attach_target_info *tgt_info);
 void bpf_trampoline_put(struct bpf_trampoline *tr);
+int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs);
 #define BPF_DISPATCHER_INIT(_name) {				\
 	.mutex = __MUTEX_INITIALIZER(_name.mutex),		\
 	.func = &_name##_func,					\
-- 
cgit v1.2.3


From cf9acc90c80ecbee00334aa85d92f4e74014bcff Mon Sep 17 00:00:00 2001
From: Jonathan Davies <jonathan.davies@nutanix.com>
Date: Tue, 16 Nov 2021 17:42:42 +0000
Subject: net: virtio_net_hdr_to_skb: count transport header in UFO

virtio_net_hdr_to_skb does not set the skb's gso_size and gso_type
correctly for UFO packets received via virtio-net that are a little over
the GSO size. This can lead to problems elsewhere in the networking
stack, e.g. ovs_vport_send dropping over-sized packets if gso_size is
not set.

This is due to the comparison

  if (skb->len - p_off > gso_size)

not properly accounting for the transport layer header.

p_off includes the size of the transport layer header (thlen), so
skb->len - p_off is the size of the TCP/UDP payload.

gso_size is read from the virtio-net header. For UFO, fragmentation
happens at the IP level so does not need to include the UDP header.

Hence the calculation could be comparing a TCP/UDP payload length with
an IP payload length, causing legitimate virtio-net packets to have
lack gso_type/gso_size information.

Example: a UDP packet with payload size 1473 has IP payload size 1481.
If the guest used UFO, it is not fragmented and the virtio-net header's
flags indicate that it is a GSO frame (VIRTIO_NET_HDR_GSO_UDP), with
gso_size = 1480 for an MTU of 1500.  skb->len will be 1515 and p_off
will be 42, so skb->len - p_off = 1473.  Hence the comparison fails, and
shinfo->gso_size and gso_type are not set as they should be.

Instead, add the UDP header length before comparing to gso_size when
using UFO. In this way, it is the size of the IP payload that is
compared to gso_size.

Fixes: 6dd912f82680 ("net: check untrusted gso_size at kernel entry")
Signed-off-by: Jonathan Davies <jonathan.davies@nutanix.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/virtio_net.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index b465f8f3e554..04e87f4b9417 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -120,10 +120,15 @@ retry:
 
 	if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
 		u16 gso_size = __virtio16_to_cpu(little_endian, hdr->gso_size);
+		unsigned int nh_off = p_off;
 		struct skb_shared_info *shinfo = skb_shinfo(skb);
 
+		/* UFO may not include transport header in gso_size. */
+		if (gso_type & SKB_GSO_UDP)
+			nh_off -= thlen;
+
 		/* Too small packets are not really GSO ones. */
-		if (skb->len - p_off > gso_size) {
+		if (skb->len - nh_off > gso_size) {
 			shinfo->gso_size = gso_size;
 			shinfo->gso_type = gso_type;
 
-- 
cgit v1.2.3


From 8160fb43d55d26d64607fd32fe69185a5f5fe41f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 16 Nov 2021 19:29:21 -0800
Subject: net: use an atomic_long_t for queue->trans_timeout

tx_timeout_show() assumed dev_watchdog() would stop all
the queues, to fetch queue->trans_timeout under protection
of the queue->_xmit_lock.

As we want to no longer disrupt transmits, we use an
atomic_long_t instead.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: david decotigny <david.decotigny@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index dd328364dfe9..1d22483cf78c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -592,7 +592,7 @@ struct netdev_queue {
 	 * Number of TX timeouts for this queue
 	 * (/sys/class/net/DEV/Q/trans_timeout)
 	 */
-	unsigned long		trans_timeout;
+	atomic_long_t		trans_timeout;
 
 	/* Subordinate device that the queue has been assigned to */
 	struct net_device	*sb_dev;
-- 
cgit v1.2.3


From 5337824f4dc4bb26f38fbbba4ffb425a92803f15 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 16 Nov 2021 19:29:22 -0800
Subject: net: annotate accesses to queue->trans_start

In following patches, dev_watchdog() will no longer stop all queues.
It will read queue->trans_start locklessly.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 1d22483cf78c..279409ef2b18 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4095,10 +4095,21 @@ static inline void __netif_tx_unlock_bh(struct netdev_queue *txq)
 	spin_unlock_bh(&txq->_xmit_lock);
 }
 
+/*
+ * txq->trans_start can be read locklessly from dev_watchdog()
+ */
 static inline void txq_trans_update(struct netdev_queue *txq)
 {
 	if (txq->xmit_lock_owner != -1)
-		txq->trans_start = jiffies;
+		WRITE_ONCE(txq->trans_start, jiffies);
+}
+
+static inline void txq_trans_cond_update(struct netdev_queue *txq)
+{
+	unsigned long now = jiffies;
+
+	if (READ_ONCE(txq->trans_start) != now)
+		WRITE_ONCE(txq->trans_start, now);
 }
 
 /* legacy drivers only, netdev_start_xmit() sets txq->trans_start */
@@ -4106,8 +4117,7 @@ static inline void netif_trans_update(struct net_device *dev)
 {
 	struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
 
-	if (txq->trans_start != jiffies)
-		txq->trans_start = jiffies;
+	txq_trans_cond_update(txq);
 }
 
 /**
-- 
cgit v1.2.3


From dab8fe320726b38a6b1dc6a7ca6e386c5f7779e8 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 16 Nov 2021 19:29:23 -0800
Subject: net: do not inline netif_tx_lock()/netif_tx_unlock()

These are not fast path, there is no point in inlining them.

Also provide netif_freeze_queues()/netif_unfreeze_queues()
so that we can use them from dev_watchdog() in the following patch.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 39 ++-------------------------------------
 1 file changed, 2 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 279409ef2b18..4f4a299e92de 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4126,27 +4126,7 @@ static inline void netif_trans_update(struct net_device *dev)
  *
  * Get network device transmit lock
  */
-static inline void netif_tx_lock(struct net_device *dev)
-{
-	unsigned int i;
-	int cpu;
-
-	spin_lock(&dev->tx_global_lock);
-	cpu = smp_processor_id();
-	for (i = 0; i < dev->num_tx_queues; i++) {
-		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
-
-		/* We are the only thread of execution doing a
-		 * freeze, but we have to grab the _xmit_lock in
-		 * order to synchronize with threads which are in
-		 * the ->hard_start_xmit() handler and already
-		 * checked the frozen bit.
-		 */
-		__netif_tx_lock(txq, cpu);
-		set_bit(__QUEUE_STATE_FROZEN, &txq->state);
-		__netif_tx_unlock(txq);
-	}
-}
+void netif_tx_lock(struct net_device *dev);
 
 static inline void netif_tx_lock_bh(struct net_device *dev)
 {
@@ -4154,22 +4134,7 @@ static inline void netif_tx_lock_bh(struct net_device *dev)
 	netif_tx_lock(dev);
 }
 
-static inline void netif_tx_unlock(struct net_device *dev)
-{
-	unsigned int i;
-
-	for (i = 0; i < dev->num_tx_queues; i++) {
-		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
-
-		/* No need to grab the _xmit_lock here.  If the
-		 * queue is not stopped for another reason, we
-		 * force a schedule.
-		 */
-		clear_bit(__QUEUE_STATE_FROZEN, &txq->state);
-		netif_schedule_queue(txq);
-	}
-	spin_unlock(&dev->tx_global_lock);
-}
+void netif_tx_unlock(struct net_device *dev);
 
 static inline void netif_tx_unlock_bh(struct net_device *dev)
 {
-- 
cgit v1.2.3


From 1881eadb2041889d74d60c074eb04189c4a07dad Mon Sep 17 00:00:00 2001
From: Abhyuday Godhasara <abhyuday.godhasara@xilinx.com>
Date: Mon, 25 Oct 2021 21:25:20 -0700
Subject: firmware: xilinx: add register notifier in zynqmp firmware

In zynqmp-firmware, register notifier is not supported, add support of
register notifier in zynqmp-firmware.

Acked-by: Michal Simek <michal.simek@xilinx.com>
Signed-off-by: Tejas Patel <tejas.patel@xilinx.com>
Signed-off-by: Abhyuday Godhasara <abhyuday.godhasara@xilinx.com>
Link: https://lore.kernel.org/r/20211026042525.26612-2-abhyuday.godhasara@xilinx.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/firmware/xlnx-zynqmp.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index 47fd4e52a423..d30d39dc8cb4 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -2,7 +2,7 @@
 /*
  * Xilinx Zynq MPSoC Firmware layer
  *
- *  Copyright (C) 2014-2019 Xilinx
+ *  Copyright (C) 2014-2021 Xilinx
  *
  *  Michal Simek <michal.simek@xilinx.com>
  *  Davorin Mista <davorin.mista@aggios.com>
@@ -66,6 +66,7 @@
 
 enum pm_api_id {
 	PM_GET_API_VERSION = 1,
+	PM_REGISTER_NOTIFIER = 5,
 	PM_SYSTEM_SHUTDOWN = 12,
 	PM_REQUEST_NODE = 13,
 	PM_RELEASE_NODE = 14,
@@ -427,6 +428,8 @@ int zynqmp_pm_pinctrl_get_config(const u32 pin, const u32 param,
 int zynqmp_pm_pinctrl_set_config(const u32 pin, const u32 param,
 				 u32 value);
 int zynqmp_pm_load_pdi(const u32 src, const u64 address);
+int zynqmp_pm_register_notifier(const u32 node, const u32 event,
+				const u32 wake, const u32 enable);
 #else
 static inline int zynqmp_pm_get_api_version(u32 *version)
 {
@@ -658,6 +661,12 @@ static inline int zynqmp_pm_load_pdi(const u32 src, const u64 address)
 {
 	return -ENODEV;
 }
+
+static inline int zynqmp_pm_register_notifier(const u32 node, const u32 event,
+					      const u32 wake, const u32 enable)
+{
+	return -ENODEV;
+}
 #endif
 
 #endif /* __FIRMWARE_ZYNQMP_H__ */
-- 
cgit v1.2.3


From fbce9f14055e547d270046f61758c29c957e675d Mon Sep 17 00:00:00 2001
From: Abhyuday Godhasara <abhyuday.godhasara@xilinx.com>
Date: Mon, 25 Oct 2021 21:25:21 -0700
Subject: firmware: xilinx: add macros of node ids for error event

Add macros for the Node-Id of Error events.

Move supported api callback ids from zynqmp-power to zynqmp-firmware.

Acked-by: Michal Simek <michal.simek@xilinx.com>
Signed-off-by: Rajan Vaja <rajan.vaja@xilinx.com>
Signed-off-by: Abhyuday Godhasara <abhyuday.godhasara@xilinx.com>
Link: https://lore.kernel.org/r/20211026042525.26612-3-abhyuday.godhasara@xilinx.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/firmware/xlnx-zynqmp.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index d30d39dc8cb4..b0a38091db71 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -64,6 +64,20 @@
 #define XILINX_ZYNQMP_PM_FPGA_FULL	0x0U
 #define XILINX_ZYNQMP_PM_FPGA_PARTIAL	BIT(0)
 
+/*
+ * Node IDs for the Error Events.
+ */
+#define EVENT_ERROR_PMC_ERR1	(0x28100000U)
+#define EVENT_ERROR_PMC_ERR2	(0x28104000U)
+#define EVENT_ERROR_PSM_ERR1	(0x28108000U)
+#define EVENT_ERROR_PSM_ERR2	(0x2810C000U)
+
+enum pm_api_cb_id {
+	PM_INIT_SUSPEND_CB = 30,
+	PM_ACKNOWLEDGE_CB = 31,
+	PM_NOTIFY_CB = 32,
+};
+
 enum pm_api_id {
 	PM_GET_API_VERSION = 1,
 	PM_REGISTER_NOTIFIER = 5,
-- 
cgit v1.2.3


From f4d77525679e289d4976ca03b620ac4cc5403205 Mon Sep 17 00:00:00 2001
From: Abhyuday Godhasara <abhyuday.godhasara@xilinx.com>
Date: Mon, 25 Oct 2021 21:25:22 -0700
Subject: firmware: xilinx: export the feature check of zynqmp firmware

Export the zynqmp_pm_feature(), so it can be use by other as to get API
version available in firmware.

Acked-by: Michal Simek <michal.simek@xilinx.com>
Signed-off-by: Rajan Vaja <rajan.vaja@xilinx.com>
Signed-off-by: Abhyuday Godhasara <abhyuday.godhasara@xilinx.com>
Link: https://lore.kernel.org/r/20211026042525.26612-4-abhyuday.godhasara@xilinx.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/firmware/xlnx-zynqmp.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index b0a38091db71..077e894bb340 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -444,6 +444,7 @@ int zynqmp_pm_pinctrl_set_config(const u32 pin, const u32 param,
 int zynqmp_pm_load_pdi(const u32 src, const u64 address);
 int zynqmp_pm_register_notifier(const u32 node, const u32 event,
 				const u32 wake, const u32 enable);
+int zynqmp_pm_feature(const u32 api_id);
 #else
 static inline int zynqmp_pm_get_api_version(u32 *version)
 {
@@ -681,6 +682,11 @@ static inline int zynqmp_pm_register_notifier(const u32 node, const u32 event,
 {
 	return -ENODEV;
 }
+
+static inline int zynqmp_pm_feature(const u32 api_id)
+{
+	return -ENODEV;
+}
 #endif
 
 #endif /* __FIRMWARE_ZYNQMP_H__ */
-- 
cgit v1.2.3


From 522a0032af005502507f5f81ae64fdcc82b5d068 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Sat, 6 Nov 2021 17:13:35 -0400
Subject: Add linux/cacheflush.h

Many architectures do not include asm-generic/cacheflush.h, so turn
the includes on their head and add linux/cacheflush.h which includes
asm/cacheflush.h.

Move the flush_dcache_folio() declaration from asm-generic/cacheflush.h
to linux/cacheflush.h and change linux/highmem.h to include
linux/cacheflush.h instead of asm/cacheflush.h so that all necessary
places will see flush_dcache_folio().

More functions should have their default implementations moved in the
future, but those are for follow-on patches.  This fixes csky, sparc and
sparc64 which were missed in the commit which added flush_dcache_folio().

Fixes: 08b0b0059bf1 ("mm: Add flush_dcache_folio()")
Suggested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 include/linux/cacheflush.h | 18 ++++++++++++++++++
 include/linux/highmem.h    |  3 +--
 2 files changed, 19 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/cacheflush.h

(limited to 'include/linux')

diff --git a/include/linux/cacheflush.h b/include/linux/cacheflush.h
new file mode 100644
index 000000000000..fef8b607f97e
--- /dev/null
+++ b/include/linux/cacheflush.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_CACHEFLUSH_H
+#define _LINUX_CACHEFLUSH_H
+
+#include <asm/cacheflush.h>
+
+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
+#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO
+void flush_dcache_folio(struct folio *folio);
+#endif
+#else
+static inline void flush_dcache_folio(struct folio *folio)
+{
+}
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO 0
+#endif /* ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE */
+
+#endif /* _LINUX_CACHEFLUSH_H */
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 25aff0f2ed0b..c944b3b70ee7 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -5,12 +5,11 @@
 #include <linux/fs.h>
 #include <linux/kernel.h>
 #include <linux/bug.h>
+#include <linux/cacheflush.h>
 #include <linux/mm.h>
 #include <linux/uaccess.h>
 #include <linux/hardirq.h>
 
-#include <asm/cacheflush.h>
-
 #include "highmem-internal.h"
 
 /**
-- 
cgit v1.2.3


From 9c3252152e8a6401c2b9e32490a5a16ec4472778 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 16 Nov 2021 21:17:14 -0500
Subject: mm: Rename folio_test_multi to folio_test_large

This is a better name.  Also add kernel-doc.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 include/linux/page-flags.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 52ec4b5e5615..05510118fbb8 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -692,7 +692,13 @@ static inline bool folio_test_single(struct folio *folio)
 	return !folio_test_head(folio);
 }
 
-static inline bool folio_test_multi(struct folio *folio)
+/**
+ * folio_test_large() - Does this folio contain more than one page?
+ * @folio: The folio to test.
+ *
+ * Return: True if the folio is larger than one page.
+ */
+static inline bool folio_test_large(struct folio *folio)
 {
 	return folio_test_head(folio);
 }
-- 
cgit v1.2.3


From a1efe484dd8c04c4c2d4eb1ee6b04d01cfc07ccc Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 16 Nov 2021 21:18:52 -0500
Subject: mm: Remove folio_test_single

There's no need for this predicate; callers can just use
!folio_test_large().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 include/linux/page-flags.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 05510118fbb8..b5f14d581113 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -686,12 +686,6 @@ static inline bool test_set_page_writeback(struct page *page)
 
 __PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY)
 
-/* Whether there are one or multiple pages in a folio */
-static inline bool folio_test_single(struct folio *folio)
-{
-	return !folio_test_head(folio);
-}
-
 /**
  * folio_test_large() - Does this folio contain more than one page?
  * @folio: The folio to test.
-- 
cgit v1.2.3


From ff36da69bc90d80b0c73f47f4b2e270b3ff6da99 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Sun, 29 Aug 2021 06:07:03 -0400
Subject: fs: Remove FS_THP_SUPPORT

Instead of setting a bit in the fs_flags to set a bit in the
address_space, set the bit in the address_space directly.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
---
 include/linux/fs.h      |  1 -
 include/linux/pagemap.h | 16 ++++++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 1cb616fc1105..bbf812ce89a8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2518,7 +2518,6 @@ struct file_system_type {
 #define FS_USERNS_MOUNT		8	/* Can be mounted by userns root */
 #define FS_DISALLOW_NOTIFY_PERM	16	/* Disable fanotify permission events */
 #define FS_ALLOW_IDMAP         32      /* FS has been updated to handle vfs idmappings. */
-#define FS_THP_SUPPORT		8192	/* Remove once all fs converted */
 #define FS_RENAME_DOES_D_MOVE	32768	/* FS will handle d_move() during rename() internally. */
 	int (*init_fs_context)(struct fs_context *);
 	const struct fs_parameter_spec *parameters;
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 1a0c646eb6ff..9e33878bf23b 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -176,6 +176,22 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
 	m->gfp_mask = mask;
 }
 
+/**
+ * mapping_set_large_folios() - Indicate the file supports large folios.
+ * @mapping: The file.
+ *
+ * The filesystem should call this function in its inode constructor to
+ * indicate that the VFS can use large folios to cache the contents of
+ * the file.
+ *
+ * Context: This should not be called while the inode is active as it
+ * is non-atomic.
+ */
+static inline void mapping_set_large_folios(struct address_space *mapping)
+{
+	__set_bit(AS_THP_SUPPORT, &mapping->flags);
+}
+
 static inline bool mapping_thp_support(struct address_space *mapping)
 {
 	return test_bit(AS_THP_SUPPORT, &mapping->flags);
-- 
cgit v1.2.3


From ed2145c474c9015bc634e35f6d1a9b7767f3fbfc Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Sun, 29 Aug 2021 06:28:19 -0400
Subject: fs: Rename AS_THP_SUPPORT and mapping_thp_support

These are now indicators of large folio support, not THP support.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 include/linux/pagemap.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 9e33878bf23b..605246452305 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -84,7 +84,7 @@ enum mapping_flags {
 	AS_EXITING	= 4, 	/* final truncate in progress */
 	/* writeback related tags are not used */
 	AS_NO_WRITEBACK_TAGS = 5,
-	AS_THP_SUPPORT = 6,	/* THPs supported */
+	AS_LARGE_FOLIO_SUPPORT = 6,
 };
 
 /**
@@ -189,12 +189,12 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
  */
 static inline void mapping_set_large_folios(struct address_space *mapping)
 {
-	__set_bit(AS_THP_SUPPORT, &mapping->flags);
+	__set_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags);
 }
 
-static inline bool mapping_thp_support(struct address_space *mapping)
+static inline bool mapping_large_folio_support(struct address_space *mapping)
 {
-	return test_bit(AS_THP_SUPPORT, &mapping->flags);
+	return test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags);
 }
 
 static inline int filemap_nr_thps(struct address_space *mapping)
@@ -209,7 +209,7 @@ static inline int filemap_nr_thps(struct address_space *mapping)
 static inline void filemap_nr_thps_inc(struct address_space *mapping)
 {
 #ifdef CONFIG_READ_ONLY_THP_FOR_FS
-	if (!mapping_thp_support(mapping))
+	if (!mapping_large_folio_support(mapping))
 		atomic_inc(&mapping->nr_thps);
 #else
 	WARN_ON_ONCE(1);
@@ -219,7 +219,7 @@ static inline void filemap_nr_thps_inc(struct address_space *mapping)
 static inline void filemap_nr_thps_dec(struct address_space *mapping)
 {
 #ifdef CONFIG_READ_ONLY_THP_FOR_FS
-	if (!mapping_thp_support(mapping))
+	if (!mapping_large_folio_support(mapping))
 		atomic_dec(&mapping->nr_thps);
 #else
 	WARN_ON_ONCE(1);
-- 
cgit v1.2.3


From 5768d8906bc23d512b1a736c1e198aa833a6daa4 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 15 Nov 2021 13:47:13 -0600
Subject: signal: Requeue signals in the appropriate queue

In the event that a tracer changes which signal needs to be delivered
and that signal is currently blocked then the signal needs to be
requeued for later delivery.

With the advent of CLONE_THREAD the kernel has 2 signal queues per
task.  The per process queue and the per task queue.  Update the code
so that if the signal is removed from the per process queue it is
requeued on the per process queue.  This is necessary to make it
appear the signal was never dequeued.

The rr debugger reasonably believes that the state of the process from
the last ptrace_stop it observed until PTRACE_EVENT_EXIT can be recreated
by simply letting a process run.  If a SIGKILL interrupts a ptrace_stop
this is not true today.

So return signals to their original queue in ptrace_signal so that
signals that are not delivered appear like they were never dequeued.

Fixes: 794aa320b79d ("[PATCH] sigfix-2.5.40-D6")
History Tree: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.gi
Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lkml.kernel.org/r/87zgq4d5r4.fsf_-_@email.froward.int.ebiederm.org
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/sched/signal.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 23505394ef70..167995d471da 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -286,17 +286,18 @@ static inline int signal_group_exit(const struct signal_struct *sig)
 extern void flush_signals(struct task_struct *);
 extern void ignore_signals(struct task_struct *);
 extern void flush_signal_handlers(struct task_struct *, int force_default);
-extern int dequeue_signal(struct task_struct *task,
-			  sigset_t *mask, kernel_siginfo_t *info);
+extern int dequeue_signal(struct task_struct *task, sigset_t *mask,
+			  kernel_siginfo_t *info, enum pid_type *type);
 
 static inline int kernel_dequeue_signal(void)
 {
 	struct task_struct *task = current;
 	kernel_siginfo_t __info;
+	enum pid_type __type;
 	int ret;
 
 	spin_lock_irq(&task->sighand->siglock);
-	ret = dequeue_signal(task, &task->blocked, &__info);
+	ret = dequeue_signal(task, &task->blocked, &__info, &__type);
 	spin_unlock_irq(&task->sighand->siglock);
 
 	return ret;
-- 
cgit v1.2.3


From e0dbd7b0ed021fb9250f7ba4d759325678efefb5 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Tue, 16 Nov 2021 23:44:28 +0100
Subject: power: supply: core: Add kerneldoc to battery struct

This complements the struct power_supply_battery_info with
extensive kerneldoc explaining the different semantics of the
fields, including an overview of the CC/CV charging concepts
implicit in some of the struct members.

This is done to first establish semantics before I can
add more charging methods by breaking out the CC/CV parameters
to its own struct.

Tested-by: Randy Dunlap <rdunlap@infradead.org>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Reviewed-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 include/linux/power_supply.h | 215 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 192 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index 2d1318fe2455..f6e94eae4f28 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -343,37 +343,206 @@ struct power_supply_resistance_temp_table {
 
 #define POWER_SUPPLY_OCV_TEMP_MAX 20
 
-/*
+/**
+ * struct power_supply_battery_info - information about batteries
+ * @technology: from the POWER_SUPPLY_TECHNOLOGY_* enum
+ * @energy_full_design_uwh: energy content when fully charged in microwatt
+ *   hours
+ * @charge_full_design_uah: charge content when fully charged in microampere
+ *   hours
+ * @voltage_min_design_uv: minimum voltage across the poles when the battery
+ *   is at minimum voltage level in microvolts. If the voltage drops below this
+ *   level the battery will need precharging when using CC/CV charging.
+ * @voltage_max_design_uv: voltage across the poles when the battery is fully
+ *   charged in microvolts. This is the "nominal voltage" i.e. the voltage
+ *   printed on the label of the battery.
+ * @tricklecharge_current_ua: the tricklecharge current used when trickle
+ *   charging the battery in microamperes. This is the charging phase when the
+ *   battery is completely empty and we need to carefully trickle in some
+ *   charge until we reach the precharging voltage.
+ * @precharge_current_ua: current to use in the precharge phase in microamperes,
+ *   the precharge rate is limited by limiting the current to this value.
+ * @precharge_voltage_max_uv: the maximum voltage allowed when precharging in
+ *   microvolts. When we pass this voltage we will nominally switch over to the
+ *   CC (constant current) charging phase defined by constant_charge_current_ua
+ *   and constant_charge_voltage_max_uv.
+ * @charge_term_current_ua: when the current in the CV (constant voltage)
+ *   charging phase drops below this value in microamperes the charging will
+ *   terminate completely and not restart until the voltage over the battery
+ *   poles reach charge_restart_voltage_uv unless we use maintenance charging.
+ * @charge_restart_voltage_uv: when the battery has been fully charged by
+ *   CC/CV charging and charging has been disabled, and the voltage subsequently
+ *   drops below this value in microvolts, the charging will be restarted
+ *   (typically using CV charging).
+ * @overvoltage_limit_uv: If the voltage exceeds the nominal voltage
+ *   voltage_max_design_uv and we reach this voltage level, all charging must
+ *   stop and emergency procedures take place, such as shutting down the system
+ *   in some cases.
+ * @constant_charge_current_max_ua: current in microamperes to use in the CC
+ *   (constant current) charging phase. The charging rate is limited
+ *   by this current. This is the main charging phase and as the current is
+ *   constant into the battery the voltage slowly ascends to
+ *   constant_charge_voltage_max_uv.
+ * @constant_charge_voltage_max_uv: voltage in microvolts signifying the end of
+ *   the CC (constant current) charging phase and the beginning of the CV
+ *   (constant voltage) charging phase.
+ * @factory_internal_resistance_uohm: the internal resistance of the battery
+ *   at fabrication time, expressed in microohms. This resistance will vary
+ *   depending on the lifetime and charge of the battery, so this is just a
+ *   nominal ballpark figure.
+ * @ocv_temp: array indicating the open circuit voltage (OCV) capacity
+ *   temperature indices. This is an array of temperatures in degrees Celsius
+ *   indicating which capacity table to use for a certain temperature, since
+ *   the capacity for reasons of chemistry will be different at different
+ *   temperatures. Determining capacity is a multivariate problem and the
+ *   temperature is the first variable we determine.
+ * @temp_ambient_alert_min: the battery will go outside of operating conditions
+ *   when the ambient temperature goes below this temperature in degrees
+ *   Celsius.
+ * @temp_ambient_alert_max: the battery will go outside of operating conditions
+ *   when the ambient temperature goes above this temperature in degrees
+ *   Celsius.
+ * @temp_alert_min: the battery should issue an alert if the internal
+ *   temperature goes below this temperature in degrees Celsius.
+ * @temp_alert_max: the battery should issue an alert if the internal
+ *   temperature goes above this temperature in degrees Celsius.
+ * @temp_min: the battery will go outside of operating conditions when
+ *   the internal temperature goes below this temperature in degrees Celsius.
+ *   Normally this means the system should shut down.
+ * @temp_max: the battery will go outside of operating conditions when
+ *   the internal temperature goes above this temperature in degrees Celsius.
+ *   Normally this means the system should shut down.
+ * @ocv_table: for each entry in ocv_temp there is a corresponding entry in
+ *   ocv_table and a size for each entry in ocv_table_size. These arrays
+ *   determine the capacity in percent in relation to the voltage in microvolts
+ *   at the indexed temperature.
+ * @ocv_table_size: for each entry in ocv_temp this array is giving the size of
+ *   each entry in the array of capacity arrays in ocv_table.
+ * @resist_table: this is a table that correlates a battery temperature to the
+ *   expected internal resistance at this temperature. The resistance is given
+ *   as a percentage of factory_internal_resistance_uohm. Knowing the
+ *   resistance of the battery is usually necessary for calculating the open
+ *   circuit voltage (OCV) that is then used with the ocv_table to calculate
+ *   the capacity of the battery. The resist_table must be ordered descending
+ *   by temperature: highest temperature with lowest resistance first, lowest
+ *   temperature with highest resistance last.
+ * @resist_table_size: the number of items in the resist_table.
+ *
  * This is the recommended struct to manage static battery parameters,
  * populated by power_supply_get_battery_info(). Most platform drivers should
  * use these for consistency.
+ *
  * Its field names must correspond to elements in enum power_supply_property.
  * The default field value is -EINVAL.
- * Power supply class itself doesn't use this.
+ *
+ * The charging parameters here assume a CC/CV charging scheme. This method
+ * is most common with Lithium Ion batteries (other methods are possible) and
+ * looks as follows:
+ *
+ * ^ Battery voltage
+ * |                                               --- overvoltage_limit_uv
+ * |
+ * |                    ...................................................
+ * |                 .. constant_charge_voltage_max_uv
+ * |              ..
+ * |             .
+ * |            .
+ * |           .
+ * |          .
+ * |         .
+ * |     .. precharge_voltage_max_uv
+ * |  ..
+ * |. (trickle charging)
+ * +------------------------------------------------------------------> time
+ *
+ * ^ Current into the battery
+ * |
+ * |      ............. constant_charge_current_max_ua
+ * |      .            .
+ * |      .             .
+ * |      .              .
+ * |      .               .
+ * |      .                ..
+ * |      .                  ....
+ * |      .                       .....
+ * |    ... precharge_current_ua       .......  charge_term_current_ua
+ * |    .                                    .
+ * |    .                                    .
+ * |.... tricklecharge_current_ua            .
+ * |                                         .
+ * +-----------------------------------------------------------------> time
+ *
+ * These diagrams are synchronized on time and the voltage and current
+ * follow each other.
+ *
+ * With CC/CV charging commence over time like this for an empty battery:
+ *
+ * 1. When the battery is completely empty it may need to be charged with
+ *    an especially small current so that electrons just "trickle in",
+ *    this is the tricklecharge_current_ua.
+ *
+ * 2. Next a small initial pre-charge current (precharge_current_ua)
+ *    is applied if the voltage is below precharge_voltage_max_uv until we
+ *    reach precharge_voltage_max_uv. CAUTION: in some texts this is referred
+ *    to as "trickle charging" but the use in the Linux kernel is different
+ *    see below!
+ *
+ * 3. Then the main charging current is applied, which is called the constant
+ *    current (CC) phase. A current regulator is set up to allow
+ *    constant_charge_current_max_ua of current to flow into the battery.
+ *    The chemical reaction in the battery will make the voltage go up as
+ *    charge goes into the battery. This current is applied until we reach
+ *    the constant_charge_voltage_max_uv voltage.
+ *
+ * 4. At this voltage we switch over to the constant voltage (CV) phase. This
+ *    means we allow current to go into the battery, but we keep the voltage
+ *    fixed. This current will continue to charge the battery while keeping
+ *    the voltage the same. A chemical reaction in the battery goes on
+ *    storing energy without affecting the voltage. Over time the current
+ *    will slowly drop and when we reach charge_term_current_ua we will
+ *    end the constant voltage phase.
+ *
+ * After this the battery is fully charged, and if we do not support maintenance
+ * charging, the charging will not restart until power dissipation makes the
+ * voltage fall so that we reach charge_restart_voltage_uv and at this point
+ * we restart charging at the appropriate phase, usually this will be inside
+ * the CV phase.
+ *
+ * If we support maintenance charging the voltage is however kept high after
+ * the CV phase with a very low current. This is meant to let the same charge
+ * go in for usage while the charger is still connected, mainly for
+ * dissipation for the power consuming entity while connected to the
+ * charger.
+ *
+ * All charging MUST terminate if the overvoltage_limit_uv is ever reached.
+ * Overcharging Lithium Ion cells can be DANGEROUS and lead to fire or
+ * explosions.
+ *
+ * The power supply class itself doesn't use this struct as of now.
  */
 
 struct power_supply_battery_info {
-	unsigned int technology;	    /* from the enum above */
-	int energy_full_design_uwh;	    /* microWatt-hours */
-	int charge_full_design_uah;	    /* microAmp-hours */
-	int voltage_min_design_uv;	    /* microVolts */
-	int voltage_max_design_uv;	    /* microVolts */
-	int tricklecharge_current_ua;	    /* microAmps */
-	int precharge_current_ua;	    /* microAmps */
-	int precharge_voltage_max_uv;	    /* microVolts */
-	int charge_term_current_ua;	    /* microAmps */
-	int charge_restart_voltage_uv;	    /* microVolts */
-	int overvoltage_limit_uv;	    /* microVolts */
-	int constant_charge_current_max_ua; /* microAmps */
-	int constant_charge_voltage_max_uv; /* microVolts */
-	int factory_internal_resistance_uohm;   /* microOhms */
-	int ocv_temp[POWER_SUPPLY_OCV_TEMP_MAX];/* celsius */
-	int temp_ambient_alert_min;             /* celsius */
-	int temp_ambient_alert_max;             /* celsius */
-	int temp_alert_min;                     /* celsius */
-	int temp_alert_max;                     /* celsius */
-	int temp_min;                           /* celsius */
-	int temp_max;                           /* celsius */
+	unsigned int technology;
+	int energy_full_design_uwh;
+	int charge_full_design_uah;
+	int voltage_min_design_uv;
+	int voltage_max_design_uv;
+	int tricklecharge_current_ua;
+	int precharge_current_ua;
+	int precharge_voltage_max_uv;
+	int charge_term_current_ua;
+	int charge_restart_voltage_uv;
+	int overvoltage_limit_uv;
+	int constant_charge_current_max_ua;
+	int constant_charge_voltage_max_uv;
+	int factory_internal_resistance_uohm;
+	int ocv_temp[POWER_SUPPLY_OCV_TEMP_MAX];
+	int temp_ambient_alert_min;
+	int temp_ambient_alert_max;
+	int temp_alert_min;
+	int temp_alert_max;
+	int temp_min;
+	int temp_max;
 	struct power_supply_battery_ocv_table *ocv_table[POWER_SUPPLY_OCV_TEMP_MAX];
 	int ocv_table_size[POWER_SUPPLY_OCV_TEMP_MAX];
 	struct power_supply_resistance_temp_table *resist_table;
-- 
cgit v1.2.3


From 6bb835f3d00467c9a5e35f4955afa29df96a404e Mon Sep 17 00:00:00 2001
From: Andriy Tryshnivskyy <andriy.tryshnivskyy@opensynergy.com>
Date: Sun, 24 Oct 2021 12:16:26 +0300
Subject: iio: core: Introduce IIO_VAL_INT_64.

Introduce IIO_VAL_INT_64 to read 64-bit value for
channel attribute. Val is used as lower 32 bits.

Signed-off-by: Andriy Tryshnivskyy <andriy.tryshnivskyy@opensynergy.com>
Link: https://lore.kernel.org/r/20211024091627.28031-2-andriy.tryshnivskyy@opensynergy.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/types.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h
index 84b3f8175cc6..a7aa91f3a8dc 100644
--- a/include/linux/iio/types.h
+++ b/include/linux/iio/types.h
@@ -24,6 +24,7 @@ enum iio_event_info {
 #define IIO_VAL_INT_PLUS_NANO 3
 #define IIO_VAL_INT_PLUS_MICRO_DB 4
 #define IIO_VAL_INT_MULTIPLE 5
+#define IIO_VAL_INT_64 6 /* 64-bit data, val is lower 32 bits */
 #define IIO_VAL_FRACTIONAL 10
 #define IIO_VAL_FRACTIONAL_LOG2 11
 #define IIO_VAL_CHAR 12
-- 
cgit v1.2.3


From 2925748eadc33cba3bded7b69475a1b002b124ac Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Wed, 17 Nov 2021 13:22:53 +0000
Subject: firmware: cs_dsp: Add version checks on coefficient loading

The firmware coefficient files contain version information that is
currently ignored by the cs_dsp code. This information specifies which
version of the firmware the coefficient were generated for. Add a check
into the code which prints a warning in the case the coefficient and
firmware differ in version, in many cases this will be ok but it is not
always, so best to let the user know there is a potential issue.

Co-authored-by: Simon Trimmer <simont@opensource.cirrus.com>
Signed-off-by: Simon Trimmer <simont@opensource.cirrus.com>
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://lore.kernel.org/r/20211117132300.1290-3-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/firmware/cirrus/cs_dsp.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/firmware/cirrus/cs_dsp.h b/include/linux/firmware/cirrus/cs_dsp.h
index 3a54b1afc48f..ce54705e2bec 100644
--- a/include/linux/firmware/cirrus/cs_dsp.h
+++ b/include/linux/firmware/cirrus/cs_dsp.h
@@ -54,12 +54,14 @@ struct cs_dsp_region {
  * struct cs_dsp_alg_region - Describes a logical algorithm region in DSP address space
  * @list:	List node for internal use
  * @alg:	Algorithm id
+ * @ver:	Expected algorithm version
  * @type:	Memory region type
  * @base:	Address of region
  */
 struct cs_dsp_alg_region {
 	struct list_head list;
 	unsigned int alg;
+	unsigned int ver;
 	int type;
 	unsigned int base;
 };
-- 
cgit v1.2.3


From 14055b5a3a23204c4702ae5d3f2a819ee081ce33 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Wed, 17 Nov 2021 13:22:54 +0000
Subject: firmware: cs_dsp: Add pre_run callback

The code already has a post_run callback, add a matching pre_run
callback to the client_ops that is called before execution is started.
This callback provides a convenient place for the client code to
set DSP controls or hardware that requires configuration before
the DSP core actually starts execution. Note that placing this callback
before cs_dsp_coeff_sync_controls is important to ensure that any
control values are then correctly synced out to the chip.

Co-authored-by: Simon Trimmer <simont@opensource.cirrus.com>
Signed-off-by: Simon Trimmer <simont@opensource.cirrus.com>
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://lore.kernel.org/r/20211117132300.1290-4-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/firmware/cirrus/cs_dsp.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/firmware/cirrus/cs_dsp.h b/include/linux/firmware/cirrus/cs_dsp.h
index ce54705e2bec..0bf849baeaa5 100644
--- a/include/linux/firmware/cirrus/cs_dsp.h
+++ b/include/linux/firmware/cirrus/cs_dsp.h
@@ -187,7 +187,8 @@ struct cs_dsp {
  * struct cs_dsp_client_ops - client callbacks
  * @control_add:	Called under the pwr_lock when a control is created
  * @control_remove:	Called under the pwr_lock when a control is destroyed
- * @post_run:		Called under the pwr_lock by cs_dsp_run()
+ * @pre_run:		Called under the pwr_lock by cs_dsp_run() before the core is started
+ * @post_run:		Called under the pwr_lock by cs_dsp_run() after the core is started
  * @post_stop:		Called under the pwr_lock by cs_dsp_stop()
  * @watchdog_expired:	Called when a watchdog expiry is detected
  *
@@ -197,6 +198,7 @@ struct cs_dsp {
 struct cs_dsp_client_ops {
 	int (*control_add)(struct cs_dsp_coeff_ctl *ctl);
 	void (*control_remove)(struct cs_dsp_coeff_ctl *ctl);
+	int (*pre_run)(struct cs_dsp *dsp);
 	int (*post_run)(struct cs_dsp *dsp);
 	void (*post_stop)(struct cs_dsp *dsp);
 	void (*watchdog_expired)(struct cs_dsp *dsp);
-- 
cgit v1.2.3


From b329b3d39497a9fdb175d7e4fd77ae7170d5d26c Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Wed, 17 Nov 2021 13:22:58 +0000
Subject: firmware: cs_dsp: Clarify some kernel doc comments

Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://lore.kernel.org/r/20211117132300.1290-8-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/firmware/cirrus/cs_dsp.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/firmware/cirrus/cs_dsp.h b/include/linux/firmware/cirrus/cs_dsp.h
index 0bf849baeaa5..1ad1b173417a 100644
--- a/include/linux/firmware/cirrus/cs_dsp.h
+++ b/include/linux/firmware/cirrus/cs_dsp.h
@@ -76,8 +76,8 @@ struct cs_dsp_alg_region {
  * @enabled:		Flag indicating whether control is enabled
  * @list:		List node for internal use
  * @cache:		Cached value of the control
- * @offset:		Offset of control within alg_region
- * @len:		Length of the cached value
+ * @offset:		Offset of control within alg_region in words
+ * @len:		Length of the cached value in bytes
  * @set:		Flag indicating the value has been written by the user
  * @flags:		Bitfield of WMFW_CTL_FLAG_ control flags defined in wmfw.h
  * @type:		One of the WMFW_CTL_TYPE_ control types defined in wmfw.h
-- 
cgit v1.2.3


From f444da38ac924748de696c393327a44c4b8d727e Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Wed, 17 Nov 2021 13:22:59 +0000
Subject: firmware: cs_dsp: Add offset to cs_dsp read/write

Provide a mechanism to access only part of a control through the cs_dsp
interface.

Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://lore.kernel.org/r/20211117132300.1290-9-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/firmware/cirrus/cs_dsp.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/firmware/cirrus/cs_dsp.h b/include/linux/firmware/cirrus/cs_dsp.h
index 1ad1b173417a..38b4da3ddfe4 100644
--- a/include/linux/firmware/cirrus/cs_dsp.h
+++ b/include/linux/firmware/cirrus/cs_dsp.h
@@ -232,8 +232,10 @@ void cs_dsp_init_debugfs(struct cs_dsp *dsp, struct dentry *debugfs_root);
 void cs_dsp_cleanup_debugfs(struct cs_dsp *dsp);
 
 int cs_dsp_coeff_write_acked_control(struct cs_dsp_coeff_ctl *ctl, unsigned int event_id);
-int cs_dsp_coeff_write_ctrl(struct cs_dsp_coeff_ctl *ctl, const void *buf, size_t len);
-int cs_dsp_coeff_read_ctrl(struct cs_dsp_coeff_ctl *ctl, void *buf, size_t len);
+int cs_dsp_coeff_write_ctrl(struct cs_dsp_coeff_ctl *ctl, unsigned int off,
+			    const void *buf, size_t len);
+int cs_dsp_coeff_read_ctrl(struct cs_dsp_coeff_ctl *ctl, unsigned int off,
+			   void *buf, size_t len);
 struct cs_dsp_coeff_ctl *cs_dsp_get_ctl(struct cs_dsp *dsp, const char *name, int type,
 					unsigned int alg);
 
-- 
cgit v1.2.3


From 5c903f64ce97172d63f7591cfa9e37cba58867b2 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Wed, 17 Nov 2021 13:23:00 +0000
Subject: firmware: cs_dsp: Allow creation of event controls

Some firmwares contain controls intended to convey firmware state back
to the host. Whilst more infrastructure will probably be needed for
these in time, as a first step allow creation of the controls, so said
firmwares arn't completely rejected.

Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://lore.kernel.org/r/20211117132300.1290-10-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/firmware/cirrus/wmfw.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/firmware/cirrus/wmfw.h b/include/linux/firmware/cirrus/wmfw.h
index a19bf7c6fc8b..74e5a4f6c13a 100644
--- a/include/linux/firmware/cirrus/wmfw.h
+++ b/include/linux/firmware/cirrus/wmfw.h
@@ -29,6 +29,7 @@
 #define WMFW_CTL_TYPE_ACKED       0x1000 /* acked control */
 #define WMFW_CTL_TYPE_HOSTEVENT   0x1001 /* event control */
 #define WMFW_CTL_TYPE_HOST_BUFFER 0x1002 /* host buffer pointer */
+#define WMFW_CTL_TYPE_FWEVENT     0x1004 /* firmware event control */
 
 struct wmfw_header {
 	char magic[4];
-- 
cgit v1.2.3


From 357a18ad230f0867791b788d2b1d6f280f6f6e61 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Mon, 15 Nov 2021 16:50:27 +0000
Subject: KVM: Kill kvm_map_gfn() / kvm_unmap_gfn() and gfn_to_pfn_cache

In commit 7e2175ebd695 ("KVM: x86: Fix recording of guest steal time /
preempted status") I removed the only user of these functions because
it was basically impossible to use them safely.

There are two stages to the GFN->PFN mapping; first through the KVM
memslots to a userspace HVA and then through the page tables to
translate that HVA to an underlying PFN. Invalidations of the former
were being handled correctly, but no attempt was made to use the MMU
notifiers to invalidate the cache when the HVA->GFN mapping changed.

As a prelude to reinventing the gfn_to_pfn_cache with more usable
semantics, rip it out entirely and untangle the implementation of
the unsafe kvm_vcpu_map()/kvm_vcpu_unmap() functions from it.

All current users of kvm_vcpu_map() also look broken right now, and
will be dealt with separately. They broadly fall into two classes:

* Those which map, access the data and immediately unmap. This is
  mostly gratuitous and could just as well use the existing user
  HVA, and could probably benefit from a gfn_to_hva_cache as they
  do so.

* Those which keep the mapping around for a longer time, perhaps
  even using the PFN directly from the guest. These will need to
  be converted to the new gfn_to_pfn_cache and then kvm_vcpu_map()
  can be removed too.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Message-Id: <20211115165030.7422-8-dwmw2@infradead.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h  | 6 +-----
 include/linux/kvm_types.h | 7 -------
 2 files changed, 1 insertion(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 60a35d9fe259..eb625af4fc5e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -866,7 +866,7 @@ void kvm_release_pfn_dirty(kvm_pfn_t pfn);
 void kvm_set_pfn_dirty(kvm_pfn_t pfn);
 void kvm_set_pfn_accessed(kvm_pfn_t pfn);
 
-void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache);
+void kvm_release_pfn(kvm_pfn_t pfn, bool dirty);
 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
 			int len);
 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
@@ -942,12 +942,8 @@ struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn
 kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn);
 kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
 int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map);
-int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map,
-		struct gfn_to_pfn_cache *cache, bool atomic);
 struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn);
 void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty);
-int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map,
-		  struct gfn_to_pfn_cache *cache, bool dirty, bool atomic);
 unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn);
 unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable);
 int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset,
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 2237abb93ccd..234eab059839 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -53,13 +53,6 @@ struct gfn_to_hva_cache {
 	struct kvm_memory_slot *memslot;
 };
 
-struct gfn_to_pfn_cache {
-	u64 generation;
-	gfn_t gfn;
-	kvm_pfn_t pfn;
-	bool dirty;
-};
-
 #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
 /*
  * Memory caches are used to preallocate memory ahead of various MMU flows,
-- 
cgit v1.2.3


From f915b75bffb7257bd8d26376b8e1cc67771927f8 Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Wed, 17 Nov 2021 15:56:52 +0800
Subject: page_pool: Revert "page_pool: disable dma mapping support..."

This reverts commit d00e60ee54b12de945b8493cf18c1ada9e422514.

As reported by Guillaume in [1]:
Enabling LPAE always enables CONFIG_ARCH_DMA_ADDR_T_64BIT
in 32-bit systems, which breaks the bootup proceess when a
ethernet driver is using page pool with PP_FLAG_DMA_MAP flag.
As we were hoping we had no active consumers for such system
when we removed the dma mapping support, and LPAE seems like
a common feature for 32 bits system, so revert it.

1. https://www.spinics.net/lists/netdev/msg779890.html

Fixes: d00e60ee54b1 ("page_pool: disable dma mapping support for 32-bit arch with 64-bit DMA")
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Reported-by: "kernelci.org bot" <bot@kernelci.org>
Tested-by: "kernelci.org bot" <bot@kernelci.org>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mm_types.h | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index bb8c6f5f19bc..c3a6e6209600 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -105,7 +105,18 @@ struct page {
 			struct page_pool *pp;
 			unsigned long _pp_mapping_pad;
 			unsigned long dma_addr;
-			atomic_long_t pp_frag_count;
+			union {
+				/**
+				 * dma_addr_upper: might require a 64-bit
+				 * value on 32-bit architectures.
+				 */
+				unsigned long dma_addr_upper;
+				/**
+				 * For frag page support, not supported in
+				 * 32-bit architectures with 64-bit DMA.
+				 */
+				atomic_long_t pp_frag_count;
+			};
 		};
 		struct {	/* slab, slob and slub */
 			union {
-- 
cgit v1.2.3


From df6160deb3debe6f964c16349f9431157ff67dda Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 17 Nov 2021 17:57:29 -0800
Subject: tcp: add missing htmldocs for skb->ll_node and sk->defer_list

Add missing entries to fix these "make htmldocs" warnings.

./include/linux/skbuff.h:953: warning: Function parameter or member 'll_node' not described in 'sk_buff'
./include/net/sock.h:540: warning: Function parameter or member 'defer_list' not described in 'sock'

Fixes: f35f821935d8 ("tcp: defer skb freeing after socket lock is released")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b8b806512e16..100fd604fbc9 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -627,6 +627,7 @@ typedef unsigned char *sk_buff_data_t;
  *		for retransmit timer
  *	@rbnode: RB tree node, alternative to next/prev for netem/tcp
  *	@list: queue head
+ *	@ll_node: anchor in an llist (eg socket defer_list)
  *	@sk: Socket we are owned by
  *	@ip_defrag_offset: (aka @sk) alternate use of @sk, used in
  *		fragmentation management
-- 
cgit v1.2.3


From 6966df483d7b5b218aeb0e13e7e334a8fc3c1744 Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Date: Thu, 18 Nov 2021 13:49:51 +0200
Subject: regulator: Update protection IRQ helper docs

The documentation of IRQ notification helper had still references to
first RFC implementation which called BUG() while trying to protect the
hardware. Behaviour was improved as calling the BUG() was not a proper
solution. Current implementation attempts to call poweroff if handling
of potentially damaging error notification fails. Update the
documentation to reflect the actual behaviour.

Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Link: https://lore.kernel.org/r/0c9cc4bcf20c3da66fd5a85c97ee4288e5727538.1637233864.git.matti.vaittinen@fi.rohmeurope.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regulator/driver.h | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index bd7a73db2e66..54cf566616ae 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -499,7 +499,8 @@ struct regulator_irq_data {
  *		best to shut-down regulator(s) or reboot the SOC if error
  *		handling is repeatedly failing. If fatal_cnt is given the IRQ
  *		handling is aborted if it fails for fatal_cnt times and die()
- *		callback (if populated) or BUG() is called to try to prevent
+ *		callback (if populated) is called. If die() is not populated
+ *		poweroff for the system is attempted in order to prevent any
  *		further damage.
  * @reread_ms:	The time which is waited before attempting to re-read status
  *		at the worker if IC reading fails. Immediate re-read is done
@@ -516,11 +517,12 @@ struct regulator_irq_data {
  * @data:	Driver private data pointer which will be passed as such to
  *		the renable, map_event and die callbacks in regulator_irq_data.
  * @die:	Protection callback. If IC status reading or recovery actions
- *		fail fatal_cnt times this callback or BUG() is called. This
- *		callback should implement a final protection attempt like
- *		disabling the regulator. If protection succeeded this may
- *		return 0. If anything else is returned the core assumes final
- *		protection failed and calls BUG() as a last resort.
+ *		fail fatal_cnt times this callback is called or system is
+ *		powered off. This callback should implement a final protection
+ *		attempt like disabling the regulator. If protection succeeded
+ *		die() may return 0. If anything else is returned the core
+ *		assumes final protection failed and attempts to perform a
+ *		poweroff as a last resort.
  * @map_event:	Driver callback to map IRQ status into regulator devices with
  *		events / errors. NOTE: callback MUST initialize both the
  *		errors and notifs for all rdevs which it signals having
-- 
cgit v1.2.3


From 8b6e88555971eac384b89fb0bd6c72ee4e1e6a6a Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Date: Thu, 18 Nov 2021 13:48:47 +0200
Subject: regulator: rohm-regulator: add helper for restricted voltage setting

Few ROHM PMICs have regulators where voltage setting can be done only
when regulator is disabled. Add helper for those PMICs.

Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Link: https://lore.kernel.org/r/6f51871e9fea611d133b5dd2560f4a7ee1ede9cd.1637233864.git.matti.vaittinen@fi.rohmeurope.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/mfd/rohm-generic.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/rohm-generic.h b/include/linux/mfd/rohm-generic.h
index 35b392a0d73a..35c5866f48b7 100644
--- a/include/linux/mfd/rohm-generic.h
+++ b/include/linux/mfd/rohm-generic.h
@@ -80,6 +80,8 @@ int rohm_regulator_set_dvs_levels(const struct rohm_dvs_config *dvs,
 				  const struct regulator_desc *desc,
 				  struct regmap *regmap);
 
+int rohm_regulator_set_voltage_sel_restricted(struct regulator_dev *rdev,
+					      unsigned int sel);
 #else
 static inline int rohm_regulator_set_dvs_levels(const struct rohm_dvs_config *dvs,
 						struct device_node *np,
@@ -88,6 +90,11 @@ static inline int rohm_regulator_set_dvs_levels(const struct rohm_dvs_config *dv
 {
 	return 0;
 }
+static int rohm_regulator_set_voltage_sel_restricted(struct regulator_dev *rdev,
+						     unsigned int sel)
+{
+	return 0;
+}
 #endif
 
 #endif
-- 
cgit v1.2.3


From 92b1348277f8893671e5354adde64fe3cf462821 Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Date: Thu, 18 Nov 2021 13:49:30 +0200
Subject: regulator: Add units to limit documentation

The documentation for limits used at protection level setting
did not mention the units. Fix the units in documentation to
match values passed in from device-tree (uV, uA, Kelvin) to
avoid confusion.

Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Link: https://lore.kernel.org/r/111114aca991e41e49a32f89b74e95285f07c1e3.1637233864.git.matti.vaittinen@fi.rohmeurope.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regulator/driver.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index bd7a73db2e66..1cb8071fee34 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -101,11 +101,13 @@ enum regulator_detection_severity {
  *		is requested.
  * @set_over_voltage_protection: Support enabling of and setting limits for over
  *	voltage situation detection. Detection can be configured for same
- *	severities as over current protection.
+ *	severities as over current protection. Units of uV.
  * @set_under_voltage_protection: Support enabling of and setting limits for
- *	under situation detection.
+ *	under voltage situation detection. Detection can be configured for same
+ *	severities as over current protection. Units of uV.
  * @set_thermal_protection: Support enabling of and setting limits for over
- *	temperature situation detection.
+ *	temperature situation detection.Detection can be configured for same
+ *	severities as over current protection. Units of degree Kelvin.
  *
  * @set_active_discharge: Set active discharge enable/disable of regulators.
  *
-- 
cgit v1.2.3


From 418e0a3551bbef5b221705b0e5b8412cdc0afd39 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 5 Nov 2021 14:42:24 +0200
Subject: lib/string_helpers: Introduce kasprintf_strarray()

We have a few users already that basically want to have array of
sequential strings to be allocated and filled.

Provide a helper for them (basically adjusted version from gpio-mockup.c).

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/string_helpers.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h
index 4ba39e1403b2..f67a94013c87 100644
--- a/include/linux/string_helpers.h
+++ b/include/linux/string_helpers.h
@@ -100,6 +100,7 @@ char *kstrdup_quotable(const char *src, gfp_t gfp);
 char *kstrdup_quotable_cmdline(struct task_struct *task, gfp_t gfp);
 char *kstrdup_quotable_file(struct file *file, gfp_t gfp);
 
+char **kasprintf_strarray(gfp_t gfp, const char *prefix, size_t n);
 void kfree_strarray(char **array, size_t n);
 
 #endif
-- 
cgit v1.2.3


From acdb89b6c87a2d7b5c48a82756e6f5c6f599f60a Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 5 Nov 2021 14:42:25 +0200
Subject: lib/string_helpers: Introduce managed variant of kasprintf_strarray()

Some of the users want to have easy way to allocate array of strings
that will be automatically cleaned when associated device is gone.

Introduce managed variant of kasprintf_strarray() for such use cases.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 include/linux/string_helpers.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h
index f67a94013c87..7a22921c9db7 100644
--- a/include/linux/string_helpers.h
+++ b/include/linux/string_helpers.h
@@ -7,6 +7,7 @@
 #include <linux/string.h>
 #include <linux/types.h>
 
+struct device;
 struct file;
 struct task_struct;
 
@@ -103,4 +104,6 @@ char *kstrdup_quotable_file(struct file *file, gfp_t gfp);
 char **kasprintf_strarray(gfp_t gfp, const char *prefix, size_t n);
 void kfree_strarray(char **array, size_t n);
 
+char **devm_kasprintf_strarray(struct device *dev, const char *prefix, size_t n);
+
 #endif
-- 
cgit v1.2.3


From 57bdeef4716689d9b0e3571034d65cf420f6efcd Mon Sep 17 00:00:00 2001
From: Naveen Naidu <naveennaidu479@gmail.com>
Date: Thu, 18 Nov 2021 19:33:11 +0530
Subject: PCI: Add PCI_ERROR_RESPONSE and related definitions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A config or MMIO read from a PCI device that doesn't exist or doesn't
respond causes a PCI error. There's no real data to return to satisfy the
CPU read, so most hardware fabricates ~0 data.

Add a PCI_ERROR_RESPONSE definition for that and use it where appropriate
to make these checks consistent and easier to find.

Also add helper definitions PCI_SET_ERROR_RESPONSE() and
PCI_POSSIBLE_ERROR() to make the code more readable.

Suggested-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://lore.kernel.org/r/55563bf4dfc5d3fdc96695373c659d099bf175b1.1637243717.git.naveennaidu479@gmail.com
Signed-off-by: Naveen Naidu <naveennaidu479@gmail.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Pali Rohár <pali@kernel.org>
---
 include/linux/pci.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 18a75c8e615c..0ce26850470e 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -154,6 +154,15 @@ enum pci_interrupt_pin {
 /* The number of legacy PCI INTx interrupts */
 #define PCI_NUM_INTX	4
 
+/*
+ * Reading from a device that doesn't respond typically returns ~0.  A
+ * successful read from a device may also return ~0, so you need additional
+ * information to reliably identify errors.
+ */
+#define PCI_ERROR_RESPONSE		(~0ULL)
+#define PCI_SET_ERROR_RESPONSE(val)	(*(val) = ((typeof(*(val))) PCI_ERROR_RESPONSE))
+#define PCI_POSSIBLE_ERROR(val)		((val) == ((typeof(val)) PCI_ERROR_RESPONSE))
+
 /*
  * pci_power_t values must match the bits in the Capabilities PME_Support
  * and Control/Status PowerState fields in the Power Management capability.
-- 
cgit v1.2.3


From c035713998700e8843c7d087f55bce3c54c0e3ec Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 5 Nov 2021 10:19:05 -0400
Subject: mm: Add functions to zero portions of a folio

These functions are wrappers around zero_user_segments(), which means
that zero_user_segments() can now be called for compound pages even when
CONFIG_TRANSPARENT_HUGEPAGE is disabled.

Use 'xend' as the name of the parameter to indicate that this is an
excluded end, not the more usual included end.  Excluding the end makes
more sense to the callers, but can cause confusion to readers who are
more used to seeing included ends.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
---
 include/linux/highmem.h | 44 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 41 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index c944b3b70ee7..39bb9b47fa9c 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -230,10 +230,10 @@ static inline void tag_clear_highpage(struct page *page)
  * If we pass in a base or tail page, we can zero up to PAGE_SIZE.
  * If we pass in a head page, we can zero up to the size of the compound page.
  */
-#if defined(CONFIG_HIGHMEM) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
+#ifdef CONFIG_HIGHMEM
 void zero_user_segments(struct page *page, unsigned start1, unsigned end1,
 		unsigned start2, unsigned end2);
-#else /* !HIGHMEM || !TRANSPARENT_HUGEPAGE */
+#else
 static inline void zero_user_segments(struct page *page,
 		unsigned start1, unsigned end1,
 		unsigned start2, unsigned end2)
@@ -253,7 +253,7 @@ static inline void zero_user_segments(struct page *page,
 	for (i = 0; i < compound_nr(page); i++)
 		flush_dcache_page(page + i);
 }
-#endif /* !HIGHMEM || !TRANSPARENT_HUGEPAGE */
+#endif
 
 static inline void zero_user_segment(struct page *page,
 	unsigned start, unsigned end)
@@ -363,4 +363,42 @@ static inline void memzero_page(struct page *page, size_t offset, size_t len)
 	kunmap_local(addr);
 }
 
+/**
+ * folio_zero_segments() - Zero two byte ranges in a folio.
+ * @folio: The folio to write to.
+ * @start1: The first byte to zero.
+ * @xend1: One more than the last byte in the first range.
+ * @start2: The first byte to zero in the second range.
+ * @xend2: One more than the last byte in the second range.
+ */
+static inline void folio_zero_segments(struct folio *folio,
+		size_t start1, size_t xend1, size_t start2, size_t xend2)
+{
+	zero_user_segments(&folio->page, start1, xend1, start2, xend2);
+}
+
+/**
+ * folio_zero_segment() - Zero a byte range in a folio.
+ * @folio: The folio to write to.
+ * @start: The first byte to zero.
+ * @xend: One more than the last byte to zero.
+ */
+static inline void folio_zero_segment(struct folio *folio,
+		size_t start, size_t xend)
+{
+	zero_user_segments(&folio->page, start, xend, 0, 0);
+}
+
+/**
+ * folio_zero_range() - Zero a byte range in a folio.
+ * @folio: The folio to write to.
+ * @start: The first byte to zero.
+ * @length: The number of bytes to zero.
+ */
+static inline void folio_zero_range(struct folio *folio,
+		size_t start, size_t length)
+{
+	zero_user_segments(&folio->page, start, start + length, 0, 0);
+}
+
 #endif /* _LINUX_HIGHMEM_H */
-- 
cgit v1.2.3


From c4f5b30dda01f2f6979a9681142de454991182ee Mon Sep 17 00:00:00 2001
From: Biju Das <biju.das.jz@bp.renesas.com>
Date: Fri, 12 Nov 2021 18:44:10 +0000
Subject: reset: Add of_reset_control_get_optional_exclusive()

Add optional variant of of_reset_control_get_exclusive(). If the
requested reset is not specified in the device tree, this function
returns NULL instead of an error.

Suggested-by: Philipp Zabel <p.zabel@pengutronix.de>
Signed-off-by: Biju Das <biju.das.jz@bp.renesas.com>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Link: https://lore.kernel.org/r/20211112184413.4391-2-biju.das.jz@bp.renesas.com
Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
---
 include/linux/reset.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/reset.h b/include/linux/reset.h
index db0e6115a2f6..8a21b5756c3e 100644
--- a/include/linux/reset.h
+++ b/include/linux/reset.h
@@ -454,6 +454,26 @@ static inline struct reset_control *of_reset_control_get_exclusive(
 	return __of_reset_control_get(node, id, 0, false, false, true);
 }
 
+/**
+ * of_reset_control_get_optional_exclusive - Lookup and obtain an optional exclusive
+ *                                           reference to a reset controller.
+ * @node: device to be reset by the controller
+ * @id: reset line name
+ *
+ * Optional variant of of_reset_control_get_exclusive(). If the requested reset
+ * is not specified in the device tree, this function returns NULL instead of
+ * an error.
+ *
+ * Returns a struct reset_control or IS_ERR() condition containing errno.
+ *
+ * Use of id names is optional.
+ */
+static inline struct reset_control *of_reset_control_get_optional_exclusive(
+				struct device_node *node, const char *id)
+{
+	return __of_reset_control_get(node, id, 0, false, true, true);
+}
+
 /**
  * of_reset_control_get_shared - Lookup and obtain a shared reference
  *                               to a reset controller.
-- 
cgit v1.2.3


From fcb116bc43c8c37c052530ead79872f8b2615711 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 18 Nov 2021 14:23:21 -0600
Subject: signal: Replace force_fatal_sig with force_exit_sig when in doubt

Recently to prevent issues with SECCOMP_RET_KILL and similar signals
being changed before they are delivered SA_IMMUTABLE was added.

Unfortunately this broke debuggers[1][2] which reasonably expect
to be able to trap synchronous SIGTRAP and SIGSEGV even when
the target process is not configured to handle those signals.

Add force_exit_sig and use it instead of force_fatal_sig where
historically the code has directly called do_exit.  This has the
implementation benefits of going through the signal exit path
(including generating core dumps) without the danger of allowing
userspace to ignore or change these signals.

This avoids userspace regressions as older kernels exited with do_exit
which debuggers also can not intercept.

In the future is should be possible to improve the quality of
implementation of the kernel by changing some of these force_exit_sig
calls to force_fatal_sig.  That can be done where it matters on
a case-by-case basis with careful analysis.

Reported-by: Kyle Huey <me@kylehuey.com>
Reported-by: kernel test robot <oliver.sang@intel.com>
[1] https://lkml.kernel.org/r/CAP045AoMY4xf8aC_4QU_-j7obuEPYgTcnQQP3Yxk=2X90jtpjw@mail.gmail.com
[2] https://lkml.kernel.org/r/20211117150258.GB5403@xsang-OptiPlex-9020
Fixes: 00b06da29cf9 ("signal: Add SA_IMMUTABLE to ensure forced siganls do not get changed")
Fixes: a3616a3c0272 ("signal/m68k: Use force_sigsegv(SIGSEGV) in fpsp040_die")
Fixes: 83a1f27ad773 ("signal/powerpc: On swapcontext failure force SIGSEGV")
Fixes: 9bc508cf0791 ("signal/s390: Use force_sigsegv in default_trap_handler")
Fixes: 086ec444f866 ("signal/sparc32: In setup_rt_frame and setup_fram use force_fatal_sig")
Fixes: c317d306d550 ("signal/sparc32: Exit with a fatal signal when try_to_clear_window_buffer fails")
Fixes: 695dd0d634df ("signal/x86: In emulate_vsyscall force a signal instead of calling do_exit")
Fixes: 1fbd60df8a85 ("signal/vm86_32: Properly send SIGSEGV when the vm86 state cannot be saved.")
Fixes: 941edc5bf174 ("exit/syscall_user_dispatch: Send ordinary signals on failure")
Link: https://lkml.kernel.org/r/871r3dqfv8.fsf_-_@email.froward.int.ebiederm.org
Reviewed-by: Kees Cook <keescook@chromium.org>
Tested-by: Kees Cook <keescook@chromium.org>
Tested-by: Kyle Huey <khuey@kylehuey.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/sched/signal.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 23505394ef70..33a50642cf41 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -352,6 +352,7 @@ extern __must_check bool do_notify_parent(struct task_struct *, int);
 extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
 extern void force_sig(int);
 extern void force_fatal_sig(int);
+extern void force_exit_sig(int);
 extern int send_sig(int, struct task_struct *, int);
 extern int zap_other_threads(struct task_struct *p);
 extern struct sigqueue *sigqueue_alloc(void);
-- 
cgit v1.2.3


From d8466f73010faf71effb21228ae1cbf577dab130 Mon Sep 17 00:00:00 2001
From: Paul Cercueil <paul@crapouillou.net>
Date: Sat, 16 Oct 2021 14:22:27 +0100
Subject: mtd: rawnand: Export nand_read_page_hwecc_oob_first()

Move the function nand_read_page_hwecc_oob_first() (previously
nand_davinci_read_page_hwecc_oob_first()) to nand_base.c, and export it
as a GPL symbol, so that it can be used by more modules.

Cc: <stable@vger.kernel.org> # v5.2
Fixes: a0ac778eb82c ("mtd: rawnand: ingenic: Add support for the JZ4740")
Signed-off-by: Paul Cercueil <paul@crapouillou.net>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20211016132228.40254-4-paul@crapouillou.net
---
 include/linux/mtd/rawnand.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index b2f9dd3cbd69..5b88cd51fadb 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -1539,6 +1539,8 @@ int nand_read_data_op(struct nand_chip *chip, void *buf, unsigned int len,
 		      bool force_8bit, bool check_only);
 int nand_write_data_op(struct nand_chip *chip, const void *buf,
 		       unsigned int len, bool force_8bit);
+int nand_read_page_hwecc_oob_first(struct nand_chip *chip, uint8_t *buf,
+				   int oob_required, int page);
 
 /* Scan and identify a NAND device */
 int nand_scan_with_ids(struct nand_chip *chip, unsigned int max_chips,
-- 
cgit v1.2.3


From adeef3e32146a8d2a73c399dc6f5d76a449131b1 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 19 Nov 2021 06:21:51 -0800
Subject: net: constify netdev->dev_addr

Commit 406f42fa0d3c ("net-next: When a bond have a massive amount
of VLANs...") introduced a rbtree for faster Ethernet address look
up. We converted all users to make modifications via appropriate
helpers, make netdev->dev_addr const.

The update helpers need to upcast from the buffer to
struct netdev_hw_addr.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4f4a299e92de..2462195784a9 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2117,7 +2117,7 @@ struct net_device {
  * Cache lines mostly used on receive path (including eth_type_trans())
  */
 	/* Interface address info used in eth_type_trans() */
-	unsigned char		*dev_addr;
+	const unsigned char	*dev_addr;
 
 	struct netdev_rx_queue	*_rx;
 	unsigned int		num_rx_queues;
@@ -4268,10 +4268,13 @@ void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list,
 void __hw_addr_init(struct netdev_hw_addr_list *list);
 
 /* Functions used for device addresses handling */
+void dev_addr_mod(struct net_device *dev, unsigned int offset,
+		  const void *addr, size_t len);
+
 static inline void
 __dev_addr_set(struct net_device *dev, const void *addr, size_t len)
 {
-	memcpy(dev->dev_addr, addr, len);
+	dev_addr_mod(dev, 0, addr, len);
 }
 
 static inline void dev_addr_set(struct net_device *dev, const u8 *addr)
@@ -4279,13 +4282,6 @@ static inline void dev_addr_set(struct net_device *dev, const u8 *addr)
 	__dev_addr_set(dev, addr, dev->addr_len);
 }
 
-static inline void
-dev_addr_mod(struct net_device *dev, unsigned int offset,
-	     const void *addr, size_t len)
-{
-	memcpy(&dev->dev_addr[offset], addr, len);
-}
-
 int dev_addr_add(struct net_device *dev, const unsigned char *addr,
 		 unsigned char addr_type);
 int dev_addr_del(struct net_device *dev, const unsigned char *addr,
-- 
cgit v1.2.3


From d07b26f5bbea9ade34dfd6abea7b3ca056c03cd1 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 19 Nov 2021 06:21:53 -0800
Subject: dev_addr: add a modification check

netdev->dev_addr should only be modified via helpers,
but someone may be casting off the const. Add a runtime
check to catch abuses.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2462195784a9..cb7f2661d187 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1942,6 +1942,8 @@ enum netdev_ml_priv_type {
  *	@unlink_list:	As netif_addr_lock() can be called recursively,
  *			keep a list of interfaces to be deleted.
  *
+ *	@dev_addr_shadow:	Copy of @dev_addr to catch direct writes.
+ *
  *	FIXME: cleanup struct net_device such that network protocol info
  *	moves out.
  */
@@ -2268,6 +2270,8 @@ struct net_device {
 
 	/* protected by rtnl_lock */
 	struct bpf_xdp_entity	xdp_state[__MAX_XDP_MODE];
+
+	u8 dev_addr_shadow[MAX_ADDR_LEN];
 };
 #define to_net_dev(d) container_of(d, struct net_device, dev)
 
@@ -4288,6 +4292,7 @@ int dev_addr_del(struct net_device *dev, const unsigned char *addr,
 		 unsigned char addr_type);
 void dev_addr_flush(struct net_device *dev);
 int dev_addr_init(struct net_device *dev);
+void dev_addr_check(struct net_device *dev);
 
 /* Functions used for unicast addresses handling */
 int dev_uc_add(struct net_device *dev, const unsigned char *addr);
-- 
cgit v1.2.3


From 85b6d24646e4125c591639841169baa98a2da503 Mon Sep 17 00:00:00 2001
From: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
Date: Fri, 19 Nov 2021 16:43:21 -0800
Subject: shm: extend forced shm destroy to support objects from several IPC
 nses

Currently, the exit_shm() function not designed to work properly when
task->sysvshm.shm_clist holds shm objects from different IPC namespaces.

This is a real pain when sysctl kernel.shm_rmid_forced = 1, because it
leads to use-after-free (reproducer exists).

This is an attempt to fix the problem by extending exit_shm mechanism to
handle shm's destroy from several IPC ns'es.

To achieve that we do several things:

1. add a namespace (non-refcounted) pointer to the struct shmid_kernel

2. during new shm object creation (newseg()/shmget syscall) we
   initialize this pointer by current task IPC ns

3. exit_shm() fully reworked such that it traverses over all shp's in
   task->sysvshm.shm_clist and gets IPC namespace not from current task
   as it was before but from shp's object itself, then call
   shm_destroy(shp, ns).

Note: We need to be really careful here, because as it was said before
(1), our pointer to IPC ns non-refcnt'ed.  To be on the safe side we
using special helper get_ipc_ns_not_zero() which allows to get IPC ns
refcounter only if IPC ns not in the "state of destruction".

Q/A

Q: Why can we access shp->ns memory using non-refcounted pointer?
A: Because shp object lifetime is always shorther than IPC namespace
   lifetime, so, if we get shp object from the task->sysvshm.shm_clist
   while holding task_lock(task) nobody can steal our namespace.

Q: Does this patch change semantics of unshare/setns/clone syscalls?
A: No. It's just fixes non-covered case when process may leave IPC
   namespace without getting task->sysvshm.shm_clist list cleaned up.

Link: https://lkml.kernel.org/r/67bb03e5-f79c-1815-e2bf-949c67047418@colorfullife.com
Link: https://lkml.kernel.org/r/20211109151501.4921-1-manfred@colorfullife.com
Fixes: ab602f79915 ("shm: make exit_shm work proportional to task activity")
Co-developed-by: Manfred Spraul <manfred@colorfullife.com>
Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Andrei Vagin <avagin@gmail.com>
Cc: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
Cc: Vasily Averin <vvs@virtuozzo.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/ipc_namespace.h | 15 +++++++++++++++
 include/linux/sched/task.h    |  2 +-
 2 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index 05e22770af51..b75395ec8d52 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -131,6 +131,16 @@ static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns)
 	return ns;
 }
 
+static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns)
+{
+	if (ns) {
+		if (refcount_inc_not_zero(&ns->ns.count))
+			return ns;
+	}
+
+	return NULL;
+}
+
 extern void put_ipc_ns(struct ipc_namespace *ns);
 #else
 static inline struct ipc_namespace *copy_ipcs(unsigned long flags,
@@ -147,6 +157,11 @@ static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns)
 	return ns;
 }
 
+static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns)
+{
+	return ns;
+}
+
 static inline void put_ipc_ns(struct ipc_namespace *ns)
 {
 }
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index ba88a6987400..058d7f371e25 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -158,7 +158,7 @@ static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
  * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
  * subscriptions and synchronises with wait4().  Also used in procfs.  Also
  * pins the final release of task.io_context.  Also protects ->cpuset and
- * ->cgroup.subsys[]. And ->vfork_done.
+ * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist.
  *
  * Nests both inside and outside of read_lock(&tasklist_lock).
  * It must not be nested with write_lock_irq(&tasklist_lock),
-- 
cgit v1.2.3


From afe041c2d0febd83698b8b0164e6b3b1dfae0b66 Mon Sep 17 00:00:00 2001
From: Bui Quang Minh <minhquangbui99@gmail.com>
Date: Fri, 19 Nov 2021 16:43:40 -0800
Subject: hugetlb: fix hugetlb cgroup refcounting during mremap

When hugetlb_vm_op_open() is called during copy_vma(), we may take the
reference to resv_map->css.  Later, when clearing the reservation
pointer of old_vma after transferring it to new_vma, we forget to drop
the reference to resv_map->css.  This leads to a reference leak of css.

Fixes this by adding a check to drop reservation css reference in
clear_vma_resv_huge_pages()

Link: https://lkml.kernel.org/r/20211113154412.91134-1-minhquangbui99@gmail.com
Fixes: 550a7d60bd5e35 ("mm, hugepages: add mremap() support for hugepage backed vma")
Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Mina Almasry <almasrymina@google.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb_cgroup.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
index c137396129db..ba025ae27882 100644
--- a/include/linux/hugetlb_cgroup.h
+++ b/include/linux/hugetlb_cgroup.h
@@ -128,6 +128,13 @@ static inline void resv_map_dup_hugetlb_cgroup_uncharge_info(
 		css_get(resv_map->css);
 }
 
+static inline void resv_map_put_hugetlb_cgroup_uncharge_info(
+						struct resv_map *resv_map)
+{
+	if (resv_map->css)
+		css_put(resv_map->css);
+}
+
 extern int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
 					struct hugetlb_cgroup **ptr);
 extern int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages,
@@ -211,6 +218,11 @@ static inline void resv_map_dup_hugetlb_cgroup_uncharge_info(
 {
 }
 
+static inline void resv_map_put_hugetlb_cgroup_uncharge_info(
+						struct resv_map *resv_map)
+{
+}
+
 static inline int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
 					       struct hugetlb_cgroup **ptr)
 {
-- 
cgit v1.2.3


From f65b8132092699e4f672111836f3f51c00c354f2 Mon Sep 17 00:00:00 2001
From: Elyes HAOUAS <ehaouas@noos.fr>
Date: Thu, 28 Oct 2021 23:05:17 +0200
Subject: include/linux/efi.h: Remove unneeded whitespaces before tabs

Signed-off-by: Elyes HAOUAS <ehaouas@noos.fr>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 include/linux/efi.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/efi.h b/include/linux/efi.h
index dbd39b20e034..de36fb547602 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -570,8 +570,8 @@ extern struct efi {
 	unsigned long			flags;
 } efi;
 
-#define EFI_RT_SUPPORTED_GET_TIME 				0x0001
-#define EFI_RT_SUPPORTED_SET_TIME 				0x0002
+#define EFI_RT_SUPPORTED_GET_TIME				0x0001
+#define EFI_RT_SUPPORTED_SET_TIME				0x0002
 #define EFI_RT_SUPPORTED_GET_WAKEUP_TIME			0x0004
 #define EFI_RT_SUPPORTED_SET_WAKEUP_TIME			0x0008
 #define EFI_RT_SUPPORTED_GET_VARIABLE				0x0010
@@ -838,7 +838,7 @@ extern int efi_status_to_err(efi_status_t status);
 #define EFI_VARIABLE_TIME_BASED_AUTHENTICATED_WRITE_ACCESS 0x0000000000000020
 #define EFI_VARIABLE_APPEND_WRITE	0x0000000000000040
 
-#define EFI_VARIABLE_MASK 	(EFI_VARIABLE_NON_VOLATILE | \
+#define EFI_VARIABLE_MASK	(EFI_VARIABLE_NON_VOLATILE | \
 				EFI_VARIABLE_BOOTSERVICE_ACCESS | \
 				EFI_VARIABLE_RUNTIME_ACCESS | \
 				EFI_VARIABLE_HARDWARE_ERROR_RECORD | \
-- 
cgit v1.2.3


From 3218910fd5858842a1dd98ce92b602f0878f8210 Mon Sep 17 00:00:00 2001
From: Adrian Larumbe <adrianml@alumnos.upm.es>
Date: Mon, 1 Nov 2021 18:08:24 +0000
Subject: dmaengine: Add core function and capability check for DMA_MEMCPY_SG

This is the old DMA_SG interface that was removed in commit
c678fa66341c ("dmaengine: remove DMA_SG as it is dead code in kernel"). It
has been renamed to DMA_MEMCPY_SG to better match the MEMSET and MEMSET_SG
naming convention.

It should only be used for mem2mem copies, either main system memory or
CPU-addressable device memory (like video memory on a PCI graphics card).

Bringing back this interface was prompted by the need to use the Xilinx
CDMA device for mem2mem SG transfers.

Signed-off-by: Adrian Larumbe <adrianml@alumnos.upm.es>
Link: https://lore.kernel.org/r/20211101180825.241048-3-adrianml@alumnos.upm.es
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/dmaengine.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 9000f3ffce8b..554a86665de9 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -50,6 +50,7 @@ enum dma_status {
  */
 enum dma_transaction_type {
 	DMA_MEMCPY,
+	DMA_MEMCPY_SG,
 	DMA_XOR,
 	DMA_PQ,
 	DMA_XOR_VAL,
@@ -891,6 +892,11 @@ struct dma_device {
 	struct dma_async_tx_descriptor *(*device_prep_dma_memcpy)(
 		struct dma_chan *chan, dma_addr_t dst, dma_addr_t src,
 		size_t len, unsigned long flags);
+	struct dma_async_tx_descriptor *(*device_prep_dma_memcpy_sg)(
+		struct dma_chan *chan,
+		struct scatterlist *dst_sg, unsigned int dst_nents,
+		struct scatterlist *src_sg, unsigned int src_nents,
+		unsigned long flags);
 	struct dma_async_tx_descriptor *(*device_prep_dma_xor)(
 		struct dma_chan *chan, dma_addr_t dst, dma_addr_t *src,
 		unsigned int src_cnt, size_t len, unsigned long flags);
@@ -1051,6 +1057,20 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_memcpy(
 						    len, flags);
 }
 
+static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_memcpy_sg(
+		struct dma_chan *chan,
+		struct scatterlist *dst_sg, unsigned int dst_nents,
+		struct scatterlist *src_sg, unsigned int src_nents,
+		unsigned long flags)
+{
+	if (!chan || !chan->device || !chan->device->device_prep_dma_memcpy_sg)
+		return NULL;
+
+	return chan->device->device_prep_dma_memcpy_sg(chan, dst_sg, dst_nents,
+						       src_sg, src_nents,
+						       flags);
+}
+
 static inline bool dmaengine_is_metadata_mode_supported(struct dma_chan *chan,
 		enum dma_desc_metadata_mode mode)
 {
-- 
cgit v1.2.3


From 0b70c256eba8448b072d25c95ee65e59da8970de Mon Sep 17 00:00:00 2001
From: Hao Chen <chenhao288@hisilicon.com>
Date: Thu, 18 Nov 2021 20:12:42 +0800
Subject: ethtool: add support to set/get rx buf len via ethtool

Add support to set rx buf len via ethtool -G parameter and get
rx buf len via ethtool -g parameter.

Signed-off-by: Hao Chen <chenhao288@hisilicon.com>
Signed-off-by: Guangbin Huang <huangguangbin2@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 845a0ffc16ee..0b252b82988b 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -67,6 +67,22 @@ enum {
 	ETH_RSS_HASH_FUNCS_COUNT
 };
 
+/**
+ * struct kernel_ethtool_ringparam - RX/TX ring configuration
+ * @rx_buf_len: Current length of buffers on the rx ring.
+ */
+struct kernel_ethtool_ringparam {
+	u32	rx_buf_len;
+};
+
+/**
+ * enum ethtool_supported_ring_param - indicator caps for setting ring params
+ * @ETHTOOL_RING_USE_RX_BUF_LEN: capture for setting rx_buf_len
+ */
+enum ethtool_supported_ring_param {
+	ETHTOOL_RING_USE_RX_BUF_LEN = BIT(0),
+};
+
 #define __ETH_RSS_HASH_BIT(bit)	((u32)1 << (bit))
 #define __ETH_RSS_HASH(name)	__ETH_RSS_HASH_BIT(ETH_RSS_HASH_##name##_BIT)
 
@@ -432,6 +448,7 @@ struct ethtool_module_power_mode_params {
  * @cap_link_lanes_supported: indicates if the driver supports lanes
  *	parameter.
  * @supported_coalesce_params: supported types of interrupt coalescing.
+ * @supported_ring_params: supported ring params.
  * @get_drvinfo: Report driver/device information.  Should only set the
  *	@driver, @version, @fw_version and @bus_info fields.  If not
  *	implemented, the @driver and @bus_info fields will be filled in
@@ -613,6 +630,7 @@ struct ethtool_module_power_mode_params {
 struct ethtool_ops {
 	u32     cap_link_lanes_supported:1;
 	u32	supported_coalesce_params;
+	u32	supported_ring_params;
 	void	(*get_drvinfo)(struct net_device *, struct ethtool_drvinfo *);
 	int	(*get_regs_len)(struct net_device *);
 	void	(*get_regs)(struct net_device *, struct ethtool_regs *, void *);
-- 
cgit v1.2.3


From 7462494408cd3de8b0bc1e79670bf213288501d0 Mon Sep 17 00:00:00 2001
From: Hao Chen <chenhao288@hisilicon.com>
Date: Thu, 18 Nov 2021 20:12:43 +0800
Subject: ethtool: extend ringparam setting/getting API with rx_buf_len

Add two new parameters kernel_ringparam and extack for
.get_ringparam and .set_ringparam to extend more ring params
through netlink.

Signed-off-by: Hao Chen <chenhao288@hisilicon.com>
Signed-off-by: Guangbin Huang <huangguangbin2@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 0b252b82988b..a26f37a27167 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -656,9 +656,13 @@ struct ethtool_ops {
 				struct kernel_ethtool_coalesce *,
 				struct netlink_ext_ack *);
 	void	(*get_ringparam)(struct net_device *,
-				 struct ethtool_ringparam *);
+				 struct ethtool_ringparam *,
+				 struct kernel_ethtool_ringparam *,
+				 struct netlink_ext_ack *);
 	int	(*set_ringparam)(struct net_device *,
-				 struct ethtool_ringparam *);
+				 struct ethtool_ringparam *,
+				 struct kernel_ethtool_ringparam *,
+				 struct netlink_ext_ack *);
 	void	(*get_pause_stats)(struct net_device *dev,
 				   struct ethtool_pause_stats *pause_stats);
 	void	(*get_pauseparam)(struct net_device *,
-- 
cgit v1.2.3


From 4b66d2161b8125b6caa6971815e85631cf3cf36f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 19 Nov 2021 07:43:31 -0800
Subject: net: annotate accesses to dev->gso_max_size

dev->gso_max_size is written under RTNL protection, or when the device is
not yet visible, but is read locklessly.

Add the READ_ONCE()/WRITE_ONCE() pairs, and use netif_set_gso_max_size()
where we can to better document what is going on.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index cb7f2661d187..14eeb58ed197 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4731,7 +4731,8 @@ static inline bool netif_needs_gso(struct sk_buff *skb,
 static inline void netif_set_gso_max_size(struct net_device *dev,
 					  unsigned int size)
 {
-	dev->gso_max_size = size;
+	/* dev->gso_max_size is read locklessly from sk_setup_caps() */
+	WRITE_ONCE(dev->gso_max_size, size);
 }
 
 static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol,
-- 
cgit v1.2.3


From 6d872df3e3b91532b142de9044e5b4984017a55f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 19 Nov 2021 07:43:32 -0800
Subject: net: annotate accesses to dev->gso_max_segs

dev->gso_max_segs is written under RTNL protection, or when the device is
not yet visible, but is read locklessly.

Add netif_set_gso_max_segs() helper.

Add the READ_ONCE()/WRITE_ONCE() pairs, and use netif_set_gso_max_segs()
where we can to better document what is going on.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 14eeb58ed197..df049864661d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4735,6 +4735,13 @@ static inline void netif_set_gso_max_size(struct net_device *dev,
 	WRITE_ONCE(dev->gso_max_size, size);
 }
 
+static inline void netif_set_gso_max_segs(struct net_device *dev,
+					  unsigned int segs)
+{
+	/* dev->gso_max_segs is read locklessly from sk_setup_caps() */
+	WRITE_ONCE(dev->gso_max_segs, segs);
+}
+
 static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol,
 					int pulled_hlen, u16 mac_offset,
 					int mac_len)
-- 
cgit v1.2.3


From 291dcae39bc482f7edc91a50d97027327e0cf5f9 Mon Sep 17 00:00:00 2001
From: Sean Anderson <sean.anderson@seco.com>
Date: Fri, 19 Nov 2021 10:58:09 -0500
Subject: net: phylink: Add helpers for c22 registers without MDIO

Some devices expose memory-mapped c22-compliant PHYs. Because these
devices do not have an MDIO bus, we cannot use the existing helpers.
Refactor the existing helpers to allow supplying the values for c22
registers directly, instead of using MDIO to access them. Only get_state
and set_advertisement are converted, since they contain the most complex
logic. Because set_advertisement is never actually used outside
phylink_mii_c22_pcs_config, move the MDIO-writing part into that
function. Because some modes do not need the advertisement register set
at all, we use -EINVAL for this purpose.

Additionally, a new function phylink_pcs_enable_an is provided to
determine whether to enable autonegotiation.

Signed-off-by: Sean Anderson <sean.anderson@seco.com>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phylink.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index 3563820a1765..01224235df0f 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -527,11 +527,12 @@ void phylink_set_port_modes(unsigned long *bits);
 void phylink_set_10g_modes(unsigned long *mask);
 void phylink_helper_basex_speed(struct phylink_link_state *state);
 
+void phylink_mii_c22_pcs_decode_state(struct phylink_link_state *state,
+				      u16 bmsr, u16 lpa);
 void phylink_mii_c22_pcs_get_state(struct mdio_device *pcs,
 				   struct phylink_link_state *state);
-int phylink_mii_c22_pcs_set_advertisement(struct mdio_device *pcs,
-					  phy_interface_t interface,
-					  const unsigned long *advertising);
+int phylink_mii_c22_pcs_encode_advertisement(phy_interface_t interface,
+					     const unsigned long *advertising);
 int phylink_mii_c22_pcs_config(struct mdio_device *pcs, unsigned int mode,
 			       phy_interface_t interface,
 			       const unsigned long *advertising);
-- 
cgit v1.2.3


From c4804670026b93f4ebddda30af89fd737bf93931 Mon Sep 17 00:00:00 2001
From: M Chetan Kumar <m.chetan.kumar@linux.intel.com>
Date: Sat, 20 Nov 2021 21:51:54 +0530
Subject: net: wwan: common debugfs base dir for wwan device

This patch set brings in a common debugfs base directory
i.e. /sys/kernel/debugfs/wwan/ in WWAN Subsystem for a
WWAN device instance. So that it avoids driver polluting
debugfs root with unrelated directories & possible name
collusion.

Having a common debugfs base directory for WWAN drivers
eases user to match control devices with debugfs entries.

WWAN Subsystem creates dentry (/sys/kernel/debugfs/wwan)
on module load & removes dentry on module unload.

When driver registers a new wwan device, dentry (wwanX)
is created for WWAN device instance & on driver unregister
dentry is removed.

New API is introduced to return the wwan device instance
dentry so that driver can create debugfs entries under it.

Signed-off-by: M Chetan Kumar <m.chetan.kumar@linux.intel.com>
Reviewed-by: Loic Poulain <loic.poulain@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/wwan.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/wwan.h b/include/linux/wwan.h
index 9fac819f92e3..1646aa3e6779 100644
--- a/include/linux/wwan.h
+++ b/include/linux/wwan.h
@@ -171,4 +171,6 @@ int wwan_register_ops(struct device *parent, const struct wwan_ops *ops,
 
 void wwan_unregister_ops(struct device *parent);
 
+struct dentry *wwan_get_debugfs_dir(struct device *parent);
+
 #endif /* __WWAN_H */
-- 
cgit v1.2.3


From fba84957e2e2e201cf4e352efe0c7cac0fbb5d5d Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Sat, 20 Nov 2021 16:31:48 -0800
Subject: skbuff: Move conditional preprocessor directives out of struct
 sk_buff

In preparation for using the struct_group() macro in struct sk_buff,
move the conditional preprocessor directives out of the region of struct
sk_buff that will be enclosed by struct_group(). While GCC and Clang are
happy with conditional preprocessor directives here, sparse is not, even
under -Wno-directive-within-macro[1], as would be seen under a C=1 build:

net/core/filter.c: note: in included file (through include/linux/netlink.h, include/linux/sock_diag.h):
./include/linux/skbuff.h:820:1: warning: directive in macro's argument list
./include/linux/skbuff.h:822:1: warning: directive in macro's argument list
./include/linux/skbuff.h:846:1: warning: directive in macro's argument list
./include/linux/skbuff.h:848:1: warning: directive in macro's argument list

Additionally remove empty macro argument definitions and usage.

"objdump -d" shows no object code differences.

[1] https://www.spinics.net/lists/linux-sparse/msg10857.html

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 059b6266dcd7..c7889afe0d0d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -795,7 +795,7 @@ struct sk_buff {
 #else
 #define CLONED_MASK	1
 #endif
-#define CLONED_OFFSET()		offsetof(struct sk_buff, __cloned_offset)
+#define CLONED_OFFSET		offsetof(struct sk_buff, __cloned_offset)
 
 	/* private: */
 	__u8			__cloned_offset[0];
@@ -818,18 +818,10 @@ struct sk_buff {
 	__u32			headers_start[0];
 	/* public: */
 
-/* if you move pkt_type around you also must adapt those constants */
-#ifdef __BIG_ENDIAN_BITFIELD
-#define PKT_TYPE_MAX	(7 << 5)
-#else
-#define PKT_TYPE_MAX	7
-#endif
-#define PKT_TYPE_OFFSET()	offsetof(struct sk_buff, __pkt_type_offset)
-
 	/* private: */
 	__u8			__pkt_type_offset[0];
 	/* public: */
-	__u8			pkt_type:3;
+	__u8			pkt_type:3; /* see PKT_TYPE_MAX */
 	__u8			ignore_df:1;
 	__u8			nf_trace:1;
 	__u8			ip_summed:2;
@@ -845,16 +837,10 @@ struct sk_buff {
 	__u8			encap_hdr_csum:1;
 	__u8			csum_valid:1;
 
-#ifdef __BIG_ENDIAN_BITFIELD
-#define PKT_VLAN_PRESENT_BIT	7
-#else
-#define PKT_VLAN_PRESENT_BIT	0
-#endif
-#define PKT_VLAN_PRESENT_OFFSET()	offsetof(struct sk_buff, __pkt_vlan_present_offset)
 	/* private: */
 	__u8			__pkt_vlan_present_offset[0];
 	/* public: */
-	__u8			vlan_present:1;
+	__u8			vlan_present:1;	/* See PKT_VLAN_PRESENT_BIT */
 	__u8			csum_complete_sw:1;
 	__u8			csum_level:2;
 	__u8			csum_not_inet:1;
@@ -953,6 +939,22 @@ struct sk_buff {
 #endif
 };
 
+/* if you move pkt_type around you also must adapt those constants */
+#ifdef __BIG_ENDIAN_BITFIELD
+#define PKT_TYPE_MAX	(7 << 5)
+#else
+#define PKT_TYPE_MAX	7
+#endif
+#define PKT_TYPE_OFFSET		offsetof(struct sk_buff, __pkt_type_offset)
+
+/* if you move pkt_vlan_present around you also must adapt these constants */
+#ifdef __BIG_ENDIAN_BITFIELD
+#define PKT_VLAN_PRESENT_BIT	7
+#else
+#define PKT_VLAN_PRESENT_BIT	0
+#endif
+#define PKT_VLAN_PRESENT_OFFSET	offsetof(struct sk_buff, __pkt_vlan_present_offset)
+
 #ifdef __KERNEL__
 /*
  *	Handling routines are only of interest to the kernel
-- 
cgit v1.2.3


From 03f61041c17914355dde7261be9ccdc821ddd454 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Sat, 20 Nov 2021 16:31:49 -0800
Subject: skbuff: Switch structure bounds to struct_group()

In preparation for FORTIFY_SOURCE performing compile-time and run-time
field bounds checking for memcpy(), memmove(), and memset(), avoid
intentionally writing across neighboring fields.

Replace the existing empty member position markers "headers_start" and
"headers_end" with a struct_group(). This will allow memcpy() and sizeof()
to more easily reason about sizes, and improve readability.

"pahole" shows no size nor member offset changes to struct sk_buff.
"objdump -d" shows no object code changes (outside of WARNs affected by
source line number changes).

Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Reviewed-by: Jason A. Donenfeld <Jason@zx2c4.com> # drivers/net/wireguard/*
Link: https://lore.kernel.org/lkml/20210728035006.GD35706@embeddedor
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c7889afe0d0d..eba256af64a5 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -811,12 +811,10 @@ struct sk_buff {
 	__u8			active_extensions;
 #endif
 
-	/* fields enclosed in headers_start/headers_end are copied
+	/* Fields enclosed in headers group are copied
 	 * using a single memcpy() in __copy_skb_header()
 	 */
-	/* private: */
-	__u32			headers_start[0];
-	/* public: */
+	struct_group(headers,
 
 	/* private: */
 	__u8			__pkt_type_offset[0];
@@ -921,9 +919,7 @@ struct sk_buff {
 	u64			kcov_handle;
 #endif
 
-	/* private: */
-	__u32			headers_end[0];
-	/* public: */
+	); /* end headers group */
 
 	/* These elements must be at the end, see alloc_skb() for details.  */
 	sk_buff_data_t		tail;
-- 
cgit v1.2.3


From e523af4ee56090fbdd9cf474752448d35930bcd4 Mon Sep 17 00:00:00 2001
From: Shiraz Saleem <shiraz.saleem@intel.com>
Date: Mon, 18 Oct 2021 18:16:02 -0500
Subject: net/ice: Add support for enable_iwarp and enable_roce devlink param

Allow support for 'enable_iwarp' and 'enable_roce' devlink params to turn
on/off iWARP or RoCE protocol support for E800 devices.

For example, a user can turn on iWARP functionality with,

devlink dev param set pci/0000:07:00.0 name enable_iwarp value true cmode runtime

This add an iWARP auxiliary rdma device, ice.iwarp.<>, under this PF.

A user request to enable both iWARP and RoCE under the same PF is rejected
since this device does not support both protocols simultaneously on the
same port.

Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
Tested-by: Leszek Kaliszczuk <leszek.kaliszczuk@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 include/linux/net/intel/iidc.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/net/intel/iidc.h b/include/linux/net/intel/iidc.h
index e32f6712aee0..1289593411d3 100644
--- a/include/linux/net/intel/iidc.h
+++ b/include/linux/net/intel/iidc.h
@@ -26,6 +26,11 @@ enum iidc_reset_type {
 	IIDC_GLOBR,
 };
 
+enum iidc_rdma_protocol {
+	IIDC_RDMA_PROTOCOL_IWARP = BIT(0),
+	IIDC_RDMA_PROTOCOL_ROCEV2 = BIT(1),
+};
+
 #define IIDC_MAX_USER_PRIORITY		8
 
 /* Struct to hold per RDMA Qset info */
@@ -70,8 +75,6 @@ int ice_rdma_request_reset(struct ice_pf *pf, enum iidc_reset_type reset_type);
 int ice_rdma_update_vsi_filter(struct ice_pf *pf, u16 vsi_id, bool enable);
 void ice_get_qos_params(struct ice_pf *pf, struct iidc_qos_params *qos);
 
-#define IIDC_RDMA_ROCE_NAME	"roce"
-
 /* Structure representing auxiliary driver tailored information about the core
  * PCI dev, each auxiliary driver using the IIDC interface will have an
  * instance of this struct dedicated to it.
-- 
cgit v1.2.3


From 6326948f940dc3f77066d5cdc44ba6afe67830c0 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Wed, 29 Sep 2021 11:01:21 -0400
Subject: lsm: security_task_getsecid_subj() ->
 security_current_getsecid_subj()

The security_task_getsecid_subj() LSM hook invites misuse by allowing
callers to specify a task even though the hook is only safe when the
current task is referenced.  Fix this by removing the task_struct
argument to the hook, requiring LSM implementations to use the
current task.  While we are changing the hook declaration we also
rename the function to security_current_getsecid_subj() in an effort
to reinforce that the hook captures the subjective credentials of the
current task and not an arbitrary task on the system.

Reviewed-by: Serge Hallyn <serge@hallyn.com>
Reviewed-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h | 3 +--
 include/linux/lsm_hooks.h     | 8 +++-----
 include/linux/security.h      | 4 ++--
 3 files changed, 6 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index df8de62f4710..ae2228f0711d 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -206,8 +206,7 @@ LSM_HOOK(int, 0, task_fix_setgid, struct cred *new, const struct cred * old,
 LSM_HOOK(int, 0, task_setpgid, struct task_struct *p, pid_t pgid)
 LSM_HOOK(int, 0, task_getpgid, struct task_struct *p)
 LSM_HOOK(int, 0, task_getsid, struct task_struct *p)
-LSM_HOOK(void, LSM_RET_VOID, task_getsecid_subj,
-	 struct task_struct *p, u32 *secid)
+LSM_HOOK(void, LSM_RET_VOID, current_getsecid_subj, u32 *secid)
 LSM_HOOK(void, LSM_RET_VOID, task_getsecid_obj,
 	 struct task_struct *p, u32 *secid)
 LSM_HOOK(int, 0, task_setnice, struct task_struct *p, int nice)
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index d45b6f6e27fd..52c1990644b9 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -719,11 +719,9 @@
  *	@p.
  *	@p contains the task_struct for the process.
  *	Return 0 if permission is granted.
- * @task_getsecid_subj:
- *	Retrieve the subjective security identifier of the task_struct in @p
- *	and return it in @secid.  Special care must be taken to ensure that @p
- *	is the either the "current" task, or the caller has exclusive access
- *	to @p.
+ * @current_getsecid_subj:
+ *	Retrieve the subjective security identifier of the current task and
+ *	return it in @secid.
  *	In case of failure, @secid will be set to zero.
  * @task_getsecid_obj:
  *	Retrieve the objective security identifier of the task_struct in @p
diff --git a/include/linux/security.h b/include/linux/security.h
index bbf44a466832..bb301963e333 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -418,7 +418,7 @@ int security_task_fix_setgid(struct cred *new, const struct cred *old,
 int security_task_setpgid(struct task_struct *p, pid_t pgid);
 int security_task_getpgid(struct task_struct *p);
 int security_task_getsid(struct task_struct *p);
-void security_task_getsecid_subj(struct task_struct *p, u32 *secid);
+void security_current_getsecid_subj(u32 *secid);
 void security_task_getsecid_obj(struct task_struct *p, u32 *secid);
 int security_task_setnice(struct task_struct *p, int nice);
 int security_task_setioprio(struct task_struct *p, int ioprio);
@@ -1119,7 +1119,7 @@ static inline int security_task_getsid(struct task_struct *p)
 	return 0;
 }
 
-static inline void security_task_getsecid_subj(struct task_struct *p, u32 *secid)
+static inline void security_current_getsecid_subj(u32 *secid)
 {
 	*secid = 0;
 }
-- 
cgit v1.2.3


From 91389c390521a02ecfb91270f5b9d7fae4312ae5 Mon Sep 17 00:00:00 2001
From: Samuel Holland <samuel@sholland.org>
Date: Thu, 18 Nov 2021 21:33:37 -0600
Subject: clk: sunxi-ng: Allow the CCU core to be built as a module

Like the individual CCU drivers, it can be beneficial for memory
consumption of cross-platform configurations to only load the CCU core
on the relevant platform. For example, a generic arm64 kernel sees the
following improvement when building the CCU core and drivers as modules:

  before:
    text      data     bss     dec       hex      filename
    13882360  5251670  360800  19494830  12977ae  vmlinux

  after:
    text      data     bss     dec       hex      filename
    13734787  5086442  360800  19182029  124b1cd  vmlinux

So the result is a 390KB total reduction in kernel image size.

The one early clock provider (sun5i) requires the core to be built in.

Now that loading the MMC driver will trigger loading the CCU core, the
MMC timing mode functions do not need a compile-time fallback.

Signed-off-by: Samuel Holland <samuel@sholland.org>
Signed-off-by: Maxime Ripard <maxime@cerno.tech>
Link: https://lore.kernel.org/r/20211119033338.25486-5-samuel@sholland.org
---
 include/linux/clk/sunxi-ng.h | 15 ---------------
 1 file changed, 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/clk/sunxi-ng.h b/include/linux/clk/sunxi-ng.h
index 3cd14acde0a1..cf32123b39f5 100644
--- a/include/linux/clk/sunxi-ng.h
+++ b/include/linux/clk/sunxi-ng.h
@@ -6,22 +6,7 @@
 #ifndef _LINUX_CLK_SUNXI_NG_H_
 #define _LINUX_CLK_SUNXI_NG_H_
 
-#include <linux/errno.h>
-
-#ifdef CONFIG_SUNXI_CCU
 int sunxi_ccu_set_mmc_timing_mode(struct clk *clk, bool new_mode);
 int sunxi_ccu_get_mmc_timing_mode(struct clk *clk);
-#else
-static inline int sunxi_ccu_set_mmc_timing_mode(struct clk *clk,
-						bool new_mode)
-{
-	return -ENOTSUPP;
-}
-
-static inline int sunxi_ccu_get_mmc_timing_mode(struct clk *clk)
-{
-	return -ENOTSUPP;
-}
-#endif
 
 #endif
-- 
cgit v1.2.3


From c214f124161d446b340597e7c968e0a2dc149142 Mon Sep 17 00:00:00 2001
From: Lukasz Luba <lukasz.luba@arm.com>
Date: Tue, 9 Nov 2021 19:57:10 +0000
Subject: arch_topology: Introduce thermal pressure update function

The thermal pressure is a mechanism which is used for providing
information about reduced CPU performance to the scheduler. Usually code
has to convert the value from frequency units into capacity units,
which are understandable by the scheduler. Create a common conversion code
which can be just used via a handy API.

Internally, the topology_update_thermal_pressure() operates on frequency
in MHz and max CPU frequency is taken from 'freq_factor' (per-cpu).

Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Thara Gopinath <thara.gopinath@linaro.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 include/linux/arch_topology.h  | 3 +++
 include/linux/sched/topology.h | 7 +++++++
 2 files changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index b97cea83b25e..ace1e5dcf773 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -59,6 +59,9 @@ static inline unsigned long topology_get_thermal_pressure(int cpu)
 void topology_set_thermal_pressure(const struct cpumask *cpus,
 				   unsigned long th_pressure);
 
+void topology_update_thermal_pressure(const struct cpumask *cpus,
+				      unsigned long capped_freq);
+
 struct cpu_topology {
 	int thread_id;
 	int core_id;
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index c07bfa2d80f2..6e89a8e43aa7 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -273,6 +273,13 @@ void arch_set_thermal_pressure(const struct cpumask *cpus,
 { }
 #endif
 
+#ifndef arch_update_thermal_pressure
+static __always_inline
+void arch_update_thermal_pressure(const struct cpumask *cpus,
+				  unsigned long capped_frequency)
+{ }
+#endif
+
 static inline int task_node(const struct task_struct *p)
 {
 	return cpu_to_node(task_cpu(p));
-- 
cgit v1.2.3


From 7e97b3dc2556743dd02612c92a8de7026e8d7dc9 Mon Sep 17 00:00:00 2001
From: Lukasz Luba <lukasz.luba@arm.com>
Date: Tue, 9 Nov 2021 19:57:14 +0000
Subject: arch_topology: Remove unused topology_set_thermal_pressure() and
 related

There is no need of this function (and related) since code has been
converted to use the new arch_update_thermal_pressure() API. The old
code can be removed.

Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 include/linux/arch_topology.h  | 3 ---
 include/linux/sched/topology.h | 7 -------
 2 files changed, 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index ace1e5dcf773..cce6136b300a 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -56,9 +56,6 @@ static inline unsigned long topology_get_thermal_pressure(int cpu)
 	return per_cpu(thermal_pressure, cpu);
 }
 
-void topology_set_thermal_pressure(const struct cpumask *cpus,
-				   unsigned long th_pressure);
-
 void topology_update_thermal_pressure(const struct cpumask *cpus,
 				      unsigned long capped_freq);
 
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 6e89a8e43aa7..8054641c0a7b 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -266,13 +266,6 @@ unsigned long arch_scale_thermal_pressure(int cpu)
 }
 #endif
 
-#ifndef arch_set_thermal_pressure
-static __always_inline
-void arch_set_thermal_pressure(const struct cpumask *cpus,
-			       unsigned long th_pressure)
-{ }
-#endif
-
 #ifndef arch_update_thermal_pressure
 static __always_inline
 void arch_update_thermal_pressure(const struct cpumask *cpus,
-- 
cgit v1.2.3


From cff6f593251cdf5398dc3c57f7032b8e9dcb633e Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Date: Tue, 23 Nov 2021 12:36:47 +0200
Subject: regulator: rohm-generic: iniline stub function

The function rohm_regulator_set_voltage_sel_restricted() has a stub
implementation. Linux-next testing spot following:

include/linux/mfd/rohm-generic.h:93:12: error:
'rohm_regulator_set_voltage_sel_restricted' defined but not used

Fix this by inlining the stub.

Fixes: 8b6e88555971 ("regulator: rohm-regulator: add helper for restricted voltage setting")
Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Link: https://lore.kernel.org/r/YZzEP3S7U15bTDAI@fedora
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/mfd/rohm-generic.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/rohm-generic.h b/include/linux/mfd/rohm-generic.h
index 35c5866f48b7..080d60adcd5f 100644
--- a/include/linux/mfd/rohm-generic.h
+++ b/include/linux/mfd/rohm-generic.h
@@ -90,7 +90,8 @@ static inline int rohm_regulator_set_dvs_levels(const struct rohm_dvs_config *dv
 {
 	return 0;
 }
-static int rohm_regulator_set_voltage_sel_restricted(struct regulator_dev *rdev,
+
+static inline int rohm_regulator_set_voltage_sel_restricted(struct regulator_dev *rdev,
 						     unsigned int sel)
 {
 	return 0;
-- 
cgit v1.2.3


From 2106efda785b55a8957efed9a52dfa28ee0d7280 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 22 Nov 2021 17:24:47 -0800
Subject: net: remove .ndo_change_proto_down

.ndo_change_proto_down was added seemingly to enable out-of-tree
implementations. Over 2.5yrs later we still have no real users
upstream. Hardwire the generic implementation for now, we can
revert once real users materialize. (rocker is a test vehicle,
not a user.)

We need to drop the optimization on the sysfs side, because
unlike ndos priv_flags will be changed at runtime, so we'd
need READ_ONCE/WRITE_ONCE everywhere..

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index df049864661d..db3bff1ae7fd 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1297,11 +1297,6 @@ struct netdev_net_notifier {
  *	TX queue.
  * int (*ndo_get_iflink)(const struct net_device *dev);
  *	Called to get the iflink value of this device.
- * void (*ndo_change_proto_down)(struct net_device *dev,
- *				 bool proto_down);
- *	This function is used to pass protocol port error state information
- *	to the switch driver. The switch driver can react to the proto_down
- *      by doing a phys down on the associated switch port.
  * int (*ndo_fill_metadata_dst)(struct net_device *dev, struct sk_buff *skb);
  *	This function is used to get egress tunnel information for given skb.
  *	This is useful for retrieving outer tunnel header parameters while
@@ -1542,8 +1537,6 @@ struct net_device_ops {
 						      int queue_index,
 						      u32 maxrate);
 	int			(*ndo_get_iflink)(const struct net_device *dev);
-	int			(*ndo_change_proto_down)(struct net_device *dev,
-							 bool proto_down);
 	int			(*ndo_fill_metadata_dst)(struct net_device *dev,
 						       struct sk_buff *skb);
 	void			(*ndo_set_rx_headroom)(struct net_device *dev,
@@ -1612,6 +1605,7 @@ struct net_device_ops {
  * @IFF_LIVE_RENAME_OK: rename is allowed while device is up and running
  * @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with
  *	skb_headlen(skb) == 0 (data starts from frag0)
+ * @IFF_CHANGE_PROTO_DOWN: device supports setting carrier via IFLA_PROTO_DOWN
  */
 enum netdev_priv_flags {
 	IFF_802_1Q_VLAN			= 1<<0,
@@ -1646,6 +1640,7 @@ enum netdev_priv_flags {
 	IFF_L3MDEV_RX_HANDLER		= 1<<29,
 	IFF_LIVE_RENAME_OK		= 1<<30,
 	IFF_TX_SKB_NO_LINEAR		= 1<<31,
+	IFF_CHANGE_PROTO_DOWN		= BIT_ULL(32),
 };
 
 #define IFF_802_1Q_VLAN			IFF_802_1Q_VLAN
@@ -1982,7 +1977,7 @@ struct net_device {
 
 	/* Read-mostly cache-line for fast-path access */
 	unsigned int		flags;
-	unsigned int		priv_flags;
+	unsigned long long	priv_flags;
 	const struct net_device_ops *netdev_ops;
 	int			ifindex;
 	unsigned short		gflags;
@@ -3735,7 +3730,6 @@ int dev_get_port_parent_id(struct net_device *dev,
 			   struct netdev_phys_item_id *ppid, bool recurse);
 bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b);
 int dev_change_proto_down(struct net_device *dev, bool proto_down);
-int dev_change_proto_down_generic(struct net_device *dev, bool proto_down);
 void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
 				  u32 value);
 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again);
-- 
cgit v1.2.3


From 985e9ece1e55a94da842f6c1f9ff84d587b26267 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Wed, 17 Nov 2021 20:07:35 +0200
Subject: ACPI: Make acpi_node_get_parent() local

acpi_node_get_parent() isn't used outside drivers/acpi/property.c.

Make it local.

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/acpi.h | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 668d007f0917..b28f8790192a 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1182,7 +1182,6 @@ int acpi_node_prop_get(const struct fwnode_handle *fwnode, const char *propname,
 
 struct fwnode_handle *acpi_get_next_subnode(const struct fwnode_handle *fwnode,
 					    struct fwnode_handle *child);
-struct fwnode_handle *acpi_node_get_parent(const struct fwnode_handle *fwnode);
 
 struct acpi_probe_entry;
 typedef bool (*acpi_probe_entry_validate_subtbl)(struct acpi_subtable_header *,
@@ -1287,12 +1286,6 @@ acpi_get_next_subnode(const struct fwnode_handle *fwnode,
 	return NULL;
 }
 
-static inline struct fwnode_handle *
-acpi_node_get_parent(const struct fwnode_handle *fwnode)
-{
-	return NULL;
-}
-
 static inline struct fwnode_handle *
 acpi_graph_get_next_endpoint(const struct fwnode_handle *fwnode,
 			     struct fwnode_handle *prev)
-- 
cgit v1.2.3


From c6d5f1933085f9a92ed5c256a859ab31c7a35f88 Mon Sep 17 00:00:00 2001
From: Kurt Kanzenbach <kurt@linutronix.de>
Date: Mon, 22 Nov 2021 12:19:31 +0100
Subject: net: stmmac: Calculate CDC error only once

The clock domain crossing error (CDC) is calculated at every fetch of Tx or Rx
timestamps. It includes a division. Especially on arm32 based systems it is
expensive. It also requires two conditionals in the hotpath.

Add a compensation value cache to struct plat_stmmacenet_data and subtract it
unconditionally in the RX/TX functions which spares the conditionals.

The value is initialized to 0 and if supported calculated in the PTP
initialization code.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Kurt Kanzenbach <kurt@linutronix.de>
Link: https://lore.kernel.org/r/20211122111931.135135-1-kurt@linutronix.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/stmmac.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index a6f03b36fc4f..89b8e208cd7b 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -241,6 +241,7 @@ struct plat_stmmacenet_data {
 	unsigned int clk_ref_rate;
 	unsigned int mult_fact_100ns;
 	s32 ptp_max_adj;
+	u32 cdc_error_adj;
 	struct reset_control *stmmac_rst;
 	struct reset_control *stmmac_ahb_rst;
 	struct stmmac_axi *axi;
-- 
cgit v1.2.3


From 393c3714081a53795bbff0e985d24146def6f57f Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Thu, 18 Nov 2021 15:00:08 -0800
Subject: kernfs: switch global kernfs_rwsem lock to per-fs lock

The kernfs implementation has big lock granularity(kernfs_rwsem) so
every kernfs-based(e.g., sysfs, cgroup) fs are able to compete the
lock. It makes trouble for some cases to wait the global lock
for a long time even though they are totally independent contexts
each other.

A general example is process A goes under direct reclaim with holding
the lock when it accessed the file in sysfs and process B is waiting
the lock with exclusive mode and then process C is waiting the lock
until process B could finish the job after it gets the lock from
process A.

This patch switches the global kernfs_rwsem to per-fs lock, which
put the rwsem into kernfs_root.

Suggested-by: Tejun Heo <tj@kernel.org>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Minchan Kim <minchan@kernel.org>
Link: https://lore.kernel.org/r/20211118230008.2679780-1-minchan@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/kernfs.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 3ccce6f24548..9f650986a81b 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -16,6 +16,7 @@
 #include <linux/atomic.h>
 #include <linux/uidgid.h>
 #include <linux/wait.h>
+#include <linux/rwsem.h>
 
 struct file;
 struct dentry;
@@ -197,6 +198,7 @@ struct kernfs_root {
 	struct list_head	supers;
 
 	wait_queue_head_t	deactivate_waitq;
+	struct rw_semaphore	kernfs_rwsem;
 };
 
 struct kernfs_open_file {
-- 
cgit v1.2.3


From 1b6ed6bf32fb22ef8e3572fc9c0f6454adf1ca40 Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Date: Wed, 24 Nov 2021 09:17:37 +0200
Subject: regulator: Drop unnecessary struct member

The irq_flags from the regulator IRQ helper description struct was never
used. The IRQ flags are passed as parameters to helper registration
instead.

Remove the unnecessary struct field.

Fixes: 7111c6d1b31b ("regulator: IRQ based event/error notification helpers")
Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Link: https://lore.kernel.org/r/5f6371e178453fa2b165da50452f7db4e986debb.1637736436.git.matti.vaittinen@fi.rohmeurope.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regulator/driver.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index 1cb8071fee34..53b25cd7ead0 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -554,7 +554,6 @@ struct regulator_irq_data {
  */
 struct regulator_irq_desc {
 	const char *name;
-	int irq_flags;
 	int fatal_cnt;
 	int reread_ms;
 	int irq_off_ms;
-- 
cgit v1.2.3


From 6fadec4c5561e2fbe1dfa8a7da9bc58d094a8f04 Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Date: Wed, 24 Nov 2021 09:16:45 +0200
Subject: regulator: Add regulator_err2notif() helper

Help drivers avoid storing both supported notification and supported error
flags by supporting conversion from regulator error to notification.
This may help saving some bytes.

Add helper for finding the regulator notification corresponding to a
regulator error.

Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Link: https://lore.kernel.org/r/eb1755ac0569ff07ffa466cf8912c6fd50e7c7c6.1637736436.git.matti.vaittinen@fi.rohmeurope.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regulator/driver.h | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index 53b25cd7ead0..6c6ec9658c30 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -645,6 +645,40 @@ struct regulator_dev {
 	spinlock_t err_lock;
 };
 
+/*
+ * Convert error flags to corresponding notifications.
+ *
+ * Can be used by drivers which use the notification helpers to
+ * find out correct notification flags based on the error flags. Drivers
+ * can avoid storing both supported notification and error flags which
+ * may save few bytes.
+ */
+static inline int regulator_err2notif(int err)
+{
+	switch (err) {
+	case REGULATOR_ERROR_UNDER_VOLTAGE:
+		return REGULATOR_EVENT_UNDER_VOLTAGE;
+	case REGULATOR_ERROR_OVER_CURRENT:
+		return REGULATOR_EVENT_OVER_CURRENT;
+	case REGULATOR_ERROR_REGULATION_OUT:
+		return REGULATOR_EVENT_REGULATION_OUT;
+	case REGULATOR_ERROR_FAIL:
+		return REGULATOR_EVENT_FAIL;
+	case REGULATOR_ERROR_OVER_TEMP:
+		return REGULATOR_EVENT_OVER_TEMP;
+	case REGULATOR_ERROR_UNDER_VOLTAGE_WARN:
+		return REGULATOR_EVENT_UNDER_VOLTAGE_WARN;
+	case REGULATOR_ERROR_OVER_CURRENT_WARN:
+		return REGULATOR_EVENT_OVER_CURRENT_WARN;
+	case REGULATOR_ERROR_OVER_VOLTAGE_WARN:
+		return REGULATOR_EVENT_OVER_VOLTAGE_WARN;
+	case REGULATOR_ERROR_OVER_TEMP_WARN:
+		return REGULATOR_EVENT_OVER_TEMP_WARN;
+	}
+	return 0;
+}
+
+
 struct regulator_dev *
 regulator_register(const struct regulator_desc *regulator_desc,
 		   const struct regulator_config *config);
-- 
cgit v1.2.3


From a764ff77d697a4a13e69b3379cc613f7409c6b9a Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Date: Wed, 24 Nov 2021 09:17:13 +0200
Subject: regulator: irq_helper: Provide helper for trivial IRQ notifications

Provide a generic map_event helper for regulators which have a notification
IRQ with single, well defined purpose. Eg, IRQ always indicates exactly one
event for exactly one regulator device. For such IRQs the mapping is
trivial.

Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Link: https://lore.kernel.org/r/603b7ed1938013a00371c1e7ccc63dfb16982b87.1637736436.git.matti.vaittinen@fi.rohmeurope.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regulator/driver.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index 6c6ec9658c30..4078c7776453 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -700,6 +700,8 @@ void *regulator_irq_helper(struct device *dev,
 			   int irq_flags, int common_errs, int *per_rdev_errs,
 			   struct regulator_dev **rdev, int rdev_amount);
 void regulator_irq_helper_cancel(void **handle);
+int regulator_irq_map_event_simple(int irq, struct regulator_irq_data *rid,
+				   unsigned long *dev_mask);
 
 void *rdev_get_drvdata(struct regulator_dev *rdev);
 struct device *rdev_get_dev(struct regulator_dev *rdev);
-- 
cgit v1.2.3


From 432dd1fc134ef902b049b26839edfd3fdc1f8dc0 Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Date: Wed, 24 Nov 2021 07:57:49 +0200
Subject: regulator: rohm-generic: remove unused dummies

Function rohm_regulator_set_voltage_sel_restricted() and
rohm_regulator_set_dvs_levels() had inlined dummy implementations for
cases when the real implementation was not configured in.

All of the drivers who issue the call to these functions do SELECT the
real implementation from the Kconfig. There should be no cases where the
real implementation was not selected by the drivers using these
functions - such a situation is likely to be an error which deserves to be
noticed at compile-time.

These dummies could in theory be used for compile-testing the drivers
only (without the generic rohm regulator pieces). However, for such
compile testing we should manually drop the selection from KConfig - and
I guess that if it does not work out-of-the-box, then it is not going to
happen. Especially when there should be no reason to omit
compile-testing the generic rohm_regulator part.

Crash test dummies.

Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Link: https://lore.kernel.org/r/YZ3UXXrk/Efe7Scj@fedora
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/mfd/rohm-generic.h | 14 --------------
 1 file changed, 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/rohm-generic.h b/include/linux/mfd/rohm-generic.h
index 080d60adcd5f..5ed97a1d0908 100644
--- a/include/linux/mfd/rohm-generic.h
+++ b/include/linux/mfd/rohm-generic.h
@@ -82,20 +82,6 @@ int rohm_regulator_set_dvs_levels(const struct rohm_dvs_config *dvs,
 
 int rohm_regulator_set_voltage_sel_restricted(struct regulator_dev *rdev,
 					      unsigned int sel);
-#else
-static inline int rohm_regulator_set_dvs_levels(const struct rohm_dvs_config *dvs,
-						struct device_node *np,
-						const struct regulator_desc *desc,
-						struct regmap *regmap)
-{
-	return 0;
-}
-
-static inline int rohm_regulator_set_voltage_sel_restricted(struct regulator_dev *rdev,
-						     unsigned int sel)
-{
-	return 0;
-}
 #endif
 
 #endif
-- 
cgit v1.2.3


From 2338e7bcef445059a99848a3eddde0b556277663 Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Mon, 15 Nov 2021 15:10:01 +0300
Subject: device property: Remove device_add_properties() API

There are no more users for it.

Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/property.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/property.h b/include/linux/property.h
index 88fa726a76df..16f736c698a2 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -378,10 +378,6 @@ property_entries_dup(const struct property_entry *properties);
 
 void property_entries_free(const struct property_entry *properties);
 
-int device_add_properties(struct device *dev,
-			  const struct property_entry *properties);
-void device_remove_properties(struct device *dev);
-
 bool device_dma_supported(struct device *dev);
 
 enum dev_dma_attr device_get_dma_attr(struct device *dev);
-- 
cgit v1.2.3


From f124034faa911ed534bf8c4881ad98dbbde2a966 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Wed, 24 Nov 2021 18:44:17 -0500
Subject: Revert "virtio_ring: validate used buffer length"

This reverts commit 939779f5152d161b34f612af29e7dc1ac4472fcf.

Attempts to validate length in the core did not work out: there turn out
to exist multiple broken devices, and in particular legacy devices are
known to be broken in this respect.

We have ideas for handling this better in the next version but for now
let's revert to a known good state to make sure drivers work for people.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/virtio.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 44d0e09da2d9..41edbc01ffa4 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -152,7 +152,6 @@ size_t virtio_max_dma_size(struct virtio_device *vdev);
  * @feature_table_size: number of entries in the feature table array.
  * @feature_table_legacy: same as feature_table but when working in legacy mode.
  * @feature_table_size_legacy: number of entries in feature table legacy array.
- * @suppress_used_validation: set to not have core validate used length
  * @probe: the function to call when a device is found.  Returns 0 or -errno.
  * @scan: optional function to call after successful probe; intended
  *    for virtio-scsi to invoke a scan.
@@ -169,7 +168,6 @@ struct virtio_driver {
 	unsigned int feature_table_size;
 	const unsigned int *feature_table_legacy;
 	unsigned int feature_table_size_legacy;
-	bool suppress_used_validation;
 	int (*validate)(struct virtio_device *dev);
 	int (*probe)(struct virtio_device *dev);
 	void (*scan)(struct virtio_device *dev);
-- 
cgit v1.2.3


From 635e4172bd0a43af943fb164799965fc9a9a705d Mon Sep 17 00:00:00 2001
From: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Date: Tue, 2 Nov 2021 07:38:10 +0100
Subject: arm: remove zte zx platform left-over

Commit 89d4f98ae90d ("ARM: remove zte zx platform") missed to remove some
definitions for this platform's debug and serial, e.g., code dependent on
the config DEBUG_ZTE_ZX.

Fortunately, ./scripts/checkkconfigsymbols.py detects this and warns:

DEBUG_ZTE_ZX
Referencing files: arch/arm/include/debug/pl01x.S

Further review by Arnd Bergmann identified even more dead code in the
amba serial driver.

Remove all this left-over from the zte zx platform.

Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Link: https://lore.kernel.org/r/20211102063810.932-1-lukas.bulwahn@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/amba/bus.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h
index edfcf7a14dcd..6c7f47846971 100644
--- a/include/linux/amba/bus.h
+++ b/include/linux/amba/bus.h
@@ -90,14 +90,8 @@ enum amba_vendor {
 	AMBA_VENDOR_ST = 0x80,
 	AMBA_VENDOR_QCOM = 0x51,
 	AMBA_VENDOR_LSI = 0xb6,
-	AMBA_VENDOR_LINUX = 0xfe,	/* This value is not official */
 };
 
-/* This is used to generate pseudo-ID for AMBA device */
-#define AMBA_LINUX_ID(conf, rev, part) \
-	(((conf) & 0xff) << 24 | ((rev) & 0xf) << 20 | \
-	AMBA_VENDOR_LINUX << 12 | ((part) & 0xfff))
-
 extern struct bus_type amba_bustype;
 
 #define to_amba_device(d)	container_of(d, struct amba_device, dev)
-- 
cgit v1.2.3


From 4167bd25ec3bc221387ec6811c05eadfe3cf1d3e Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Thu, 18 Nov 2021 08:31:24 +0100
Subject: mxser: move ids from pci_ids.h here

There is no point having MOXA PCI device IDs in include/linux/pci_ids.h.
Move them to the driver and sort them all by the ID.

Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: linux-pci@vger.kernel.org
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20211118073125.12283-19-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/pci_ids.h | 18 ------------------
 1 file changed, 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 011f2f1ea5bb..c389a9c0f290 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1964,24 +1964,6 @@
 #define PCI_DEVICE_ID_APPLICOM_PCI2000PFB 0x0003
 
 #define PCI_VENDOR_ID_MOXA		0x1393
-#define PCI_DEVICE_ID_MOXA_RC7000	0x0001
-#define PCI_DEVICE_ID_MOXA_CP102	0x1020
-#define PCI_DEVICE_ID_MOXA_CP102UL	0x1021
-#define PCI_DEVICE_ID_MOXA_CP102U	0x1022
-#define PCI_DEVICE_ID_MOXA_C104		0x1040
-#define PCI_DEVICE_ID_MOXA_CP104U	0x1041
-#define PCI_DEVICE_ID_MOXA_CP104JU	0x1042
-#define PCI_DEVICE_ID_MOXA_CP104EL	0x1043
-#define PCI_DEVICE_ID_MOXA_CT114	0x1140
-#define PCI_DEVICE_ID_MOXA_CP114	0x1141
-#define PCI_DEVICE_ID_MOXA_CP118U	0x1180
-#define PCI_DEVICE_ID_MOXA_CP118EL	0x1181
-#define PCI_DEVICE_ID_MOXA_CP132	0x1320
-#define PCI_DEVICE_ID_MOXA_CP132U	0x1321
-#define PCI_DEVICE_ID_MOXA_CP134U	0x1340
-#define PCI_DEVICE_ID_MOXA_C168		0x1680
-#define PCI_DEVICE_ID_MOXA_CP168U	0x1681
-#define PCI_DEVICE_ID_MOXA_CP168EL	0x1682
 #define PCI_DEVICE_ID_MOXA_CP204J	0x2040
 #define PCI_DEVICE_ID_MOXA_C218		0x2180
 #define PCI_DEVICE_ID_MOXA_C320		0x3200
-- 
cgit v1.2.3


From 5db96ef23bda6c2a61a51693c85b78b52d03f654 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Mon, 22 Nov 2021 12:16:48 +0100
Subject: tty: drop tty_schedule_flip()

Since commit a9c3f68f3cd8d (tty: Fix low_latency BUG) in 2014,
tty_flip_buffer_push() is only a wrapper to tty_schedule_flip(). All
users were converted in the previous patches, so remove
tty_schedule_flip() completely while inlining its body into
tty_flip_buffer_push().

One less exported function.

Reviewed-by: Johan Hovold <johan@kernel.org>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20211122111648.30379-4-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/tty_flip.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/tty_flip.h b/include/linux/tty_flip.h
index 9916acb5de49..483d41cbcbb7 100644
--- a/include/linux/tty_flip.h
+++ b/include/linux/tty_flip.h
@@ -17,7 +17,6 @@ int tty_insert_flip_string_fixed_flag(struct tty_port *port,
 int tty_prepare_flip_string(struct tty_port *port, unsigned char **chars,
 		size_t size);
 void tty_flip_buffer_push(struct tty_port *port);
-void tty_schedule_flip(struct tty_port *port);
 int __tty_insert_flip_char(struct tty_port *port, unsigned char ch, char flag);
 
 static inline int tty_insert_flip_char(struct tty_port *port,
-- 
cgit v1.2.3


From d78328bcc4d0e677f2ff83f4ae1f43c933fbd143 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Mon, 22 Nov 2021 10:45:29 +0100
Subject: tty: remove file from tty_ldisc_ops::ioctl and compat_ioctl

After the previous patches, noone needs 'file' parameter in neither
ioctl hook from tty_ldisc_ops. So remove 'file' from both of them.

Cc: Marcel Holtmann <marcel@holtmann.org>
Cc: Johan Hedberg <johan.hedberg@gmail.com>
Cc: Luiz Augusto von Dentz <luiz.dentz@gmail.com>
Cc: Wolfgang Grandegger <wg@grandegger.com>
Cc: Marc Kleine-Budde <mkl@pengutronix.de>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Andreas Koensgen <ajk@comnets.uni-bremen.de>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com> [NFC]
Acked-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Acked-by: Marc Kleine-Budde <mkl@pengutronix.de>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20211122094529.24171-1-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/tty_ldisc.h | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h
index b85d84fb5f49..25f07017bbad 100644
--- a/include/linux/tty_ldisc.h
+++ b/include/linux/tty_ldisc.h
@@ -45,8 +45,7 @@ struct tty_struct;
  *	some processing on the characters first.  If this function is
  *	not defined, the user will receive an EIO error.
  *
- * int	(*ioctl)(struct tty_struct * tty, struct file * file,
- *		 unsigned int cmd, unsigned long arg);
+ * int	(*ioctl)(struct tty_struct *tty, unsigned int cmd, unsigned long arg);
  *
  *	This function is called when the user requests an ioctl which
  *	is not handled by the tty layer or the low-level tty driver.
@@ -56,8 +55,8 @@ struct tty_struct;
  *	low-level driver can "grab" an ioctl request before the line
  *	discpline has a chance to see it.
  *
- * int	(*compat_ioctl)(struct tty_struct * tty, struct file * file,
- *		        unsigned int cmd, unsigned long arg);
+ * int	(*compat_ioctl)(struct tty_struct *tty, unsigned int cmd,
+ *			unsigned long arg);
  *
  *	Process ioctl calls from 32-bit process on 64-bit system
  *
@@ -192,10 +191,10 @@ struct tty_ldisc_ops {
 			void **cookie, unsigned long offset);
 	ssize_t	(*write)(struct tty_struct *tty, struct file *file,
 			 const unsigned char *buf, size_t nr);
-	int	(*ioctl)(struct tty_struct *tty, struct file *file,
-			 unsigned int cmd, unsigned long arg);
-	int	(*compat_ioctl)(struct tty_struct *tty, struct file *file,
-				unsigned int cmd, unsigned long arg);
+	int	(*ioctl)(struct tty_struct *tty, unsigned int cmd,
+			unsigned long arg);
+	int	(*compat_ioctl)(struct tty_struct *tty, unsigned int cmd,
+			unsigned long arg);
 	void	(*set_termios)(struct tty_struct *tty, struct ktermios *old);
 	__poll_t (*poll)(struct tty_struct *, struct file *,
 			     struct poll_table_struct *);
-- 
cgit v1.2.3


From 29c3002644bdd653f6ec6407d25135d0a4f7cefb Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 24 Nov 2021 12:24:46 -0800
Subject: net: optimize skb_postpull_rcsum()

Remove one pair of add/adc instructions and their dependency
against carry flag.

We can leverage third argument to csum_partial():

  X = csum_block_sub(X, csum_partial(start, len, 0), 0);

  -->

  X = csum_block_add(X, ~csum_partial(start, len, 0), 0);

  -->

  X = ~csum_partial(start, len, ~X);

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index eba256af64a5..eae4bd3237a4 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3485,7 +3485,11 @@ __skb_postpull_rcsum(struct sk_buff *skb, const void *start, unsigned int len,
 static inline void skb_postpull_rcsum(struct sk_buff *skb,
 				      const void *start, unsigned int len)
 {
-	__skb_postpull_rcsum(skb, start, len, 0);
+	if (skb->ip_summed == CHECKSUM_COMPLETE)
+		skb->csum = ~csum_partial(start, len, ~skb->csum);
+	else if (skb->ip_summed == CHECKSUM_PARTIAL &&
+		 skb_checksum_start_offset(skb) < 0)
+		skb->ip_summed = CHECKSUM_NONE;
 }
 
 static __always_inline void
-- 
cgit v1.2.3


From b4c80629c5c9d48880c5ad99943374f9ab72432e Mon Sep 17 00:00:00 2001
From: Heinrich Schuchardt <xypron.glpk@gmx.de>
Date: Sun, 23 May 2021 22:49:57 +0200
Subject: include/linux/byteorder/generic.h: fix index variables

In cpu_to_be32_array() and be32_to_cpu_array() the length of the array is
given by variable len of type size_t. An index variable of type int is used
to iterate over the array. This is bound to fail for len > INT_MAX and
lets GCC add instructions for sign extension.

Correct the type of the index variable.

Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
Link: https://lore.kernel.org/r/20210523204958.64575-1-xypron.glpk@gmx.de
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/byteorder/generic.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/byteorder/generic.h b/include/linux/byteorder/generic.h
index 4b13e0a3e15b..c9a4c96c9943 100644
--- a/include/linux/byteorder/generic.h
+++ b/include/linux/byteorder/generic.h
@@ -190,7 +190,7 @@ static inline void be64_add_cpu(__be64 *var, u64 val)
 
 static inline void cpu_to_be32_array(__be32 *dst, const u32 *src, size_t len)
 {
-	int i;
+	size_t i;
 
 	for (i = 0; i < len; i++)
 		dst[i] = cpu_to_be32(src[i]);
@@ -198,7 +198,7 @@ static inline void cpu_to_be32_array(__be32 *dst, const u32 *src, size_t len)
 
 static inline void be32_to_cpu_array(u32 *dst, const __be32 *src, size_t len)
 {
-	int i;
+	size_t i;
 
 	for (i = 0; i < len; i++)
 		dst[i] = be32_to_cpu(src[i]);
-- 
cgit v1.2.3


From 18e6c0751cf9ae0631a2623e31af2bf504f72c30 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Fri, 26 Nov 2021 09:15:49 +0100
Subject: tty: finish kernel-doc of tty_struct members

There are already pieces of kernel-doc documentation for struct
tty_struct in tty.h. Finish the documentation for the members which were
undocumented yet.

It also includes tuning the already existing pieces like flow and ctrl,
especially adding highlights to them.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20211126081611.11001-2-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/tty.h | 79 +++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 64 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tty.h b/include/linux/tty.h
index 5dbd7c5afac7..da49ad9be281 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -122,33 +122,84 @@ struct tty_operations;
 /**
  * struct tty_struct - state associated with a tty while open
  *
- * @flow.lock: lock for flow members
- * @flow.stopped: tty stopped/started by tty_stop/tty_start
- * @flow.tco_stopped: tty stopped/started by TCOOFF/TCOON ioctls (it has
- *		      precedense over @flow.stopped)
+ * @magic: magic value set early in @alloc_tty_struct to %TTY_MAGIC, for
+ *	   debugging purposes
+ * @kref: reference counting by tty_kref_get() and tty_kref_put(), reaching zero
+ *	  frees the structure
+ * @dev: class device or %NULL (e.g. ptys, serdev)
+ * @driver: &struct tty_driver operating this tty
+ * @ops: &struct tty_operations of @driver for this tty (open, close, etc.)
+ * @index: index of this tty (e.g. to construct @name like tty12)
+ * @ldisc_sem: protects line discipline changes (@ldisc) -- lock tty not pty
+ * @ldisc: the current line discipline for this tty (n_tty by default)
+ * @atomic_write_lock: protects against concurrent writers, i.e. locks
+ *		       @write_cnt, @write_buf and similar
+ * @legacy_mutex: leftover from history (BKL -> BTM -> @legacy_mutex),
+ *		  protecting several operations on this tty
+ * @throttle_mutex: protects against concurrent tty_throttle_safe() and
+ *		    tty_unthrottle_safe() (but not tty_unthrottle())
+ * @termios_rwsem: protects @termios and @termios_locked
+ * @winsize_mutex: protects @winsize
+ * @termios: termios for the current tty, copied from/to @driver.termios
+ * @termios_locked: locked termios (by %TIOCGLCKTRMIOS and %TIOCSLCKTRMIOS
+ *		    ioctls)
+ * @name: name of the tty constructed by tty_line_name() (e.g. ttyS3)
+ * @flags: bitwise OR of %TTY_THROTTLED, %TTY_IO_ERROR, ...
+ * @count: count of open processes, reaching zero cancels all the work for
+ *	   this tty and drops a @kref too (but does not free this tty)
+ * @winsize: size of the terminal "window" (cf. @winsize_mutex)
+ * @flow: flow settings grouped together, see also @flow.unused
+ * @flow.lock: lock for @flow members
+ * @flow.stopped: tty stopped/started by stop_tty()/start_tty()
+ * @flow.tco_stopped: tty stopped/started by %TCOOFF/%TCOON ioctls (it has
+ *		      precedence over @flow.stopped)
  * @flow.unused: alignment for Alpha, so that no members other than @flow.* are
  *		 modified by the same 64b word store. The @flow's __aligned is
  *		 there for the very same reason.
- * @ctrl.lock: lock for ctrl members
+ * @ctrl: control settings grouped together, see also @ctrl.unused
+ * @ctrl.lock: lock for @ctrl members
  * @ctrl.pgrp: process group of this tty (setpgrp(2))
  * @ctrl.session: session of this tty (setsid(2)). Writes are protected by both
- *		  @ctrl.lock and legacy mutex, readers must use at least one of
+ *		  @ctrl.lock and @legacy_mutex, readers must use at least one of
  *		  them.
- * @ctrl.pktstatus: packet mode status (bitwise OR of TIOCPKT_* constants)
+ * @ctrl.pktstatus: packet mode status (bitwise OR of %TIOCPKT_ constants)
  * @ctrl.packet: packet mode enabled
+ * @ctrl.unused: alignment for Alpha, see @flow.unused for explanation
+ * @hw_stopped: not controlled by the tty layer, under @driver's control for CTS
+ *		handling
+ * @receive_room: bytes permitted to feed to @ldisc without any being lost
+ * @flow_change: controls behavior of throttling, see tty_throttle_safe() and
+ *		 tty_unthrottle_safe()
+ * @link: link to another pty (master -> slave and vice versa)
+ * @fasync: state for %O_ASYNC (for %SIGIO); managed by fasync_helper()
+ * @write_wait: concurrent writers are waiting in this queue until they are
+ *		allowed to write
+ * @read_wait: readers wait for data in this queue
+ * @hangup_work: normally a work to perform a hangup (do_tty_hangup()); while
+ *		 freeing the tty, (re)used to release_one_tty()
+ * @disc_data: pointer to @ldisc's private data (e.g. to &struct n_tty_data)
+ * @driver_data: pointer to @driver's private data (e.g. &struct uart_state)
+ * @files_lock:	protects @tty_files list
+ * @tty_files: list of (re)openers of this tty (i.e. linked &struct
+ *	       tty_file_private)
+ * @closing: when set during close, n_tty processes only START & STOP chars
+ * @write_buf: temporary buffer used during tty_write() to copy user data to
+ * @write_cnt: count of bytes written in tty_write() to @write_buf
+ * @SAK_work: if the tty has a pending do_SAK, it is queued here
+ * @port: persistent storage for this device (i.e. &struct tty_port)
  *
  * All of the state associated with a tty while the tty is open. Persistent
- * storage for tty devices is referenced here as @port in struct tty_port.
+ * storage for tty devices is referenced here as @port and is documented in
+ * &struct tty_port.
  */
 struct tty_struct {
 	int	magic;
 	struct kref kref;
-	struct device *dev;	/* class device or NULL (e.g. ptys, serdev) */
+	struct device *dev;
 	struct tty_driver *driver;
 	const struct tty_operations *ops;
 	int index;
 
-	/* Protects ldisc changes: Lock tty not pty */
 	struct ld_semaphore ldisc_sem;
 	struct tty_ldisc *ldisc;
 
@@ -157,12 +208,11 @@ struct tty_struct {
 	struct mutex throttle_mutex;
 	struct rw_semaphore termios_rwsem;
 	struct mutex winsize_mutex;
-	/* Termios values are protected by the termios rwsem */
 	struct ktermios termios, termios_locked;
 	char name[64];
 	unsigned long flags;
 	int count;
-	struct winsize winsize;		/* winsize_mutex */
+	struct winsize winsize;
 
 	struct {
 		spinlock_t lock;
@@ -181,7 +231,7 @@ struct tty_struct {
 	} __aligned(sizeof(unsigned long)) ctrl;
 
 	int hw_stopped;
-	unsigned int receive_room;	/* Bytes free for queue */
+	unsigned int receive_room;
 	int flow_change;
 
 	struct tty_struct *link;
@@ -191,7 +241,7 @@ struct tty_struct {
 	struct work_struct hangup_work;
 	void *disc_data;
 	void *driver_data;
-	spinlock_t files_lock;		/* protects tty_files list */
+	spinlock_t files_lock;
 	struct list_head tty_files;
 
 #define N_TTY_BUF_SIZE 4096
@@ -199,7 +249,6 @@ struct tty_struct {
 	int closing;
 	unsigned char *write_buf;
 	int write_cnt;
-	/* If the tty has a pending do_SAK, queue it here - akpm */
 	struct work_struct SAK_work;
 	struct tty_port *port;
 } __randomize_layout;
-- 
cgit v1.2.3


From 61c83addb77c65f498a8db0e7113dc3acf753c45 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Fri, 26 Nov 2021 09:15:50 +0100
Subject: tty: add kernel-doc for tty_port

tty_port used to have only short comments along its members. Convert
them into proper kernel-doc comments in front of the structure. And add
some more explanation to them where needed.

The whole structure purpose and handling is documented at the end too --
some pieces of preexisting text moved to this place.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20211126081611.11001-3-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/tty_port.h | 104 +++++++++++++++++++++++++++++++----------------
 1 file changed, 70 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tty_port.h b/include/linux/tty_port.h
index 6e86e9e118b6..9091e1c8de4c 100644
--- a/include/linux/tty_port.h
+++ b/include/linux/tty_port.h
@@ -7,17 +7,6 @@
 #include <linux/tty_buffer.h>
 #include <linux/wait.h>
 
-/*
- * Port level information. Each device keeps its own port level information
- * so provide a common structure for those ports wanting to use common support
- * routines.
- *
- * The tty port has a different lifetime to the tty so must be kept apart.
- * In addition be careful as tty -> port mappings are valid for the life
- * of the tty object but in many cases port -> tty mappings are valid only
- * until a hangup so don't use the wrong path.
- */
-
 struct attribute_group;
 struct tty_driver;
 struct tty_port;
@@ -48,30 +37,77 @@ struct tty_port_client_operations {
 
 extern const struct tty_port_client_operations tty_port_default_client_ops;
 
+/**
+ * struct tty_port -- port level information
+ *
+ * @buf: buffer for this port, locked internally
+ * @tty: back pointer to &struct tty_struct, valid only if the tty is open. Use
+ *	 tty_port_tty_get() to obtain it (and tty_kref_put() to release).
+ * @itty: internal back pointer to &struct tty_struct. Avoid this. It should be
+ *	  eliminated in the long term.
+ * @ops: tty port operations (like activate, shutdown), see &struct
+ *	 tty_port_operations
+ * @client_ops: tty port client operations (like receive_buf, write_wakeup).
+ *		By default, tty_port_default_client_ops is used.
+ * @lock: lock protecting @tty
+ * @blocked_open: # of procs waiting for open in tty_port_block_til_ready()
+ * @count: usage count
+ * @open_wait: open waiters queue (waiting e.g. for a carrier)
+ * @delta_msr_wait: modem status change queue (waiting for MSR changes)
+ * @flags: user TTY flags (%ASYNC_)
+ * @iflags: internal flags (%TTY_PORT_)
+ * @console: when set, the port is a console
+ * @mutex: locking, for open, shutdown and other port operations
+ * @buf_mutex: @xmit_buf alloc lock
+ * @xmit_buf: optional xmit buffer used by some drivers
+ * @close_delay: delay in jiffies to wait when closing the port
+ * @closing_wait: delay in jiffies for output to be sent before closing
+ * @drain_delay: set to zero if no pure time based drain is needed else set to
+ *		 size of fifo
+ * @kref: references counter. Reaching zero calls @ops->destruct() if non-%NULL
+ *	  or frees the port otherwise.
+ * @client_data: pointer to private data, for @client_ops
+ *
+ * Each device keeps its own port level information. &struct tty_port was
+ * introduced as a common structure for such information. As every TTY device
+ * shall have a backing tty_port structure, every driver can use these members.
+ *
+ * The tty port has a different lifetime to the tty so must be kept apart.
+ * In addition be careful as tty -> port mappings are valid for the life
+ * of the tty object but in many cases port -> tty mappings are valid only
+ * until a hangup so don't use the wrong path.
+ *
+ * Tty port shall be initialized by tty_port_init() and shut down either by
+ * tty_port_destroy() (refcounting not used), or tty_port_put() (refcounting).
+ *
+ * There is a lot of helpers around &struct tty_port too. To name the most
+ * significant ones: tty_port_open(), tty_port_close() (or
+ * tty_port_close_start() and tty_port_close_end() separately if need be), and
+ * tty_port_hangup(). These call @ops->activate() and @ops->shutdown() as
+ * needed.
+ */
 struct tty_port {
-	struct tty_bufhead	buf;		/* Locked internally */
-	struct tty_struct	*tty;		/* Back pointer */
-	struct tty_struct	*itty;		/* internal back ptr */
-	const struct tty_port_operations *ops;	/* Port operations */
-	const struct tty_port_client_operations *client_ops; /* Port client operations */
-	spinlock_t		lock;		/* Lock protecting tty field */
-	int			blocked_open;	/* Waiting to open */
-	int			count;		/* Usage count */
-	wait_queue_head_t	open_wait;	/* Open waiters */
-	wait_queue_head_t	delta_msr_wait;	/* Modem status change */
-	unsigned long		flags;		/* User TTY flags ASYNC_ */
-	unsigned long		iflags;		/* Internal flags TTY_PORT_ */
-	unsigned char		console:1;	/* port is a console */
-	struct mutex		mutex;		/* Locking */
-	struct mutex		buf_mutex;	/* Buffer alloc lock */
-	unsigned char		*xmit_buf;	/* Optional buffer */
-	unsigned int		close_delay;	/* Close port delay */
-	unsigned int		closing_wait;	/* Delay for output */
-	int			drain_delay;	/* Set to zero if no pure time
-						   based drain is needed else
-						   set to size of fifo */
-	struct kref		kref;		/* Ref counter */
-	void 			*client_data;
+	struct tty_bufhead	buf;
+	struct tty_struct	*tty;
+	struct tty_struct	*itty;
+	const struct tty_port_operations *ops;
+	const struct tty_port_client_operations *client_ops;
+	spinlock_t		lock;
+	int			blocked_open;
+	int			count;
+	wait_queue_head_t	open_wait;
+	wait_queue_head_t	delta_msr_wait;
+	unsigned long		flags;
+	unsigned long		iflags;
+	unsigned char		console:1;
+	struct mutex		mutex;
+	struct mutex		buf_mutex;
+	unsigned char		*xmit_buf;
+	unsigned int		close_delay;
+	unsigned int		closing_wait;
+	int			drain_delay;
+	struct kref		kref;
+	void			*client_data;
 };
 
 /* tty_port::iflags bits -- use atomic bit ops */
-- 
cgit v1.2.3


From a6563830215226aae0e7e6802955c77a6a7b7547 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Fri, 26 Nov 2021 09:15:51 +0100
Subject: tty: add kernel-doc for tty_driver

tty_driver used to have only short comments along its members. Convert
them into proper kernel-doc comments in front of the structure. And add
some more explanation to them where needed.

The whole structure handling is documented at the end too.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20211126081611.11001-4-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/tty_driver.h | 62 +++++++++++++++++++++++++++++++++++++---------
 1 file changed, 50 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h
index 795b94ccdeb6..3622404a678d 100644
--- a/include/linux/tty_driver.h
+++ b/include/linux/tty_driver.h
@@ -291,23 +291,61 @@ struct tty_operations {
 	int (*proc_show)(struct seq_file *, void *);
 } __randomize_layout;
 
+/**
+ * struct tty_driver -- driver for TTY devices
+ *
+ * @magic: set to %TTY_DRIVER_MAGIC in __tty_alloc_driver()
+ * @kref: reference counting. Reaching zero frees all the internals and the
+ *	  driver.
+ * @cdevs: allocated/registered character /dev devices
+ * @owner: modules owning this driver. Used drivers cannot be rmmod'ed.
+ *	   Automatically set by tty_alloc_driver().
+ * @driver_name: name of the driver used in /proc/tty
+ * @name: used for constructing /dev node name
+ * @name_base: used as a number base for constructing /dev node name
+ * @major: major /dev device number (zero for autoassignment)
+ * @minor_start: the first minor /dev device number
+ * @num: number of devices allocated
+ * @type: type of tty driver (%TTY_DRIVER_TYPE_)
+ * @subtype: subtype of tty driver (%SYSTEM_TYPE_, %PTY_TYPE_, %SERIAL_TYPE_)
+ * @init_termios: termios to set to each tty initially (e.g. %tty_std_termios)
+ * @flags: tty driver flags (%TTY_DRIVER_)
+ * @proc_entry: proc fs entry, used internally
+ * @other: driver of the linked tty; only used for the PTY driver
+ * @ttys: array of active &struct tty_struct, set by tty_standard_install()
+ * @ports: array of &struct tty_port; can be set during initialization by
+ *	   tty_port_link_device() and similar
+ * @termios: storage for termios at each TTY close for the next open
+ * @driver_state: pointer to driver's arbitrary data
+ * @ops: driver hooks for TTYs. Set them using tty_set_operations(). Use &struct
+ *	 tty_port helpers in them as much as possible.
+ * @tty_drivers: used internally to link tty_drivers together
+ *
+ * The usual handling of &struct tty_driver is to allocate it by
+ * tty_alloc_driver(), set up all the necessary members, and register it by
+ * tty_register_driver(). At last, the driver is torn down by calling
+ * tty_unregister_driver() followed by tty_driver_kref_put().
+ *
+ * The fields required to be set before calling tty_register_driver() include
+ * @driver_name, @name, @type, @subtype, @init_termios, and @ops.
+ */
 struct tty_driver {
-	int	magic;		/* magic number for this structure */
-	struct kref kref;	/* Reference management */
+	int	magic;
+	struct kref kref;
 	struct cdev **cdevs;
 	struct module	*owner;
 	const char	*driver_name;
 	const char	*name;
-	int	name_base;	/* offset of printed name */
-	int	major;		/* major device number */
-	int	minor_start;	/* start of minor device number */
-	unsigned int	num;	/* number of devices allocated */
-	short	type;		/* type of tty driver */
-	short	subtype;	/* subtype of tty driver */
-	struct ktermios init_termios; /* Initial termios */
-	unsigned long	flags;		/* tty driver flags */
-	struct proc_dir_entry *proc_entry; /* /proc fs entry */
-	struct tty_driver *other; /* only used for the PTY driver */
+	int	name_base;
+	int	major;
+	int	minor_start;
+	unsigned int	num;
+	short	type;
+	short	subtype;
+	struct ktermios init_termios;
+	unsigned long	flags;
+	struct proc_dir_entry *proc_entry;
+	struct tty_driver *other;
 
 	/*
 	 * Pointer to the tty data structures
-- 
cgit v1.2.3


From 1fe183091753b1d7f11e70593700c0c0ef268db7 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Fri, 26 Nov 2021 09:15:52 +0100
Subject: tty: add kernel-doc for tty_operations

tty_operations structure was already documented in a standalone comment
in the header beginning.

Move it right before the structure and reformat it so it complies to
kernel-doc. That way, we can include it in Documentation/ later in this
series.

Note that we named proc_show's parameters, so that we can reference
them in the text.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20211126081611.11001-5-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/tty_driver.h | 398 +++++++++++++++++++++++++++------------------
 1 file changed, 241 insertions(+), 157 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h
index 3622404a678d..5611992ab26a 100644
--- a/include/linux/tty_driver.h
+++ b/include/linux/tty_driver.h
@@ -2,248 +2,332 @@
 #ifndef _LINUX_TTY_DRIVER_H
 #define _LINUX_TTY_DRIVER_H
 
-/*
- * This structure defines the interface between the low-level tty
- * driver and the tty routines.  The following routines can be
- * defined; unless noted otherwise, they are optional, and can be
- * filled in with a null pointer.
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/cdev.h>
+#include <linux/termios.h>
+#include <linux/seq_file.h>
+
+struct tty_struct;
+struct tty_driver;
+struct serial_icounter_struct;
+struct serial_struct;
+
+/**
+ * struct tty_operations -- interface between driver and tty
  *
- * struct tty_struct * (*lookup)(struct tty_driver *self, struct file *, int idx)
+ * @lookup: ``struct tty_struct *()(struct tty_driver *self, struct file *,
+ *				    int idx)``
  *
- *	Return the tty device corresponding to idx, NULL if there is not
- *	one currently in use and an ERR_PTR value on error. Called under
- *	tty_mutex (for now!)
+ *	Return the tty device corresponding to @idx, %NULL if there is not
+ *	one currently in use and an %ERR_PTR value on error. Called under
+ *	%tty_mutex (for now!)
  *
- *	Optional method. Default behaviour is to use the ttys array
+ *	Optional method. Default behaviour is to use the @self->ttys array.
  *
- * int (*install)(struct tty_driver *self, struct tty_struct *tty)
+ * @install: ``int ()(struct tty_driver *self, struct tty_struct *tty)``
  *
- *	Install a new tty into the tty driver internal tables. Used in
- *	conjunction with lookup and remove methods.
+ *	Install a new @tty into the @self's internal tables. Used in
+ *	conjunction with @lookup and @remove methods.
  *
- *	Optional method. Default behaviour is to use the ttys array
+ *	Optional method. Default behaviour is to use the @self->ttys array.
  *
- * void (*remove)(struct tty_driver *self, struct tty_struct *tty)
+ * @remove: ``void ()(struct tty_driver *self, struct tty_struct *tty)``
  *
- *	Remove a closed tty from the tty driver internal tables. Used in
- *	conjunction with lookup and remove methods.
+ *	Remove a closed @tty from the @self's internal tables. Used in
+ *	conjunction with @lookup and @remove methods.
  *
- *	Optional method. Default behaviour is to use the ttys array
+ *	Optional method. Default behaviour is to use the @self->ttys array.
  *
- * int  (*open)(struct tty_struct * tty, struct file * filp);
+ * @open: ``int ()(struct tty_struct *tty, struct file *)``
  *
- * 	This routine is called when a particular tty device is opened.
- * 	This routine is mandatory; if this routine is not filled in,
- * 	the attempted open will fail with ENODEV.
+ *	This routine is called when a particular @tty device is opened. This
+ *	routine is mandatory; if this routine is not filled in, the attempted
+ *	open will fail with %ENODEV.
  *
  *	Required method. Called with tty lock held.
  *
- * void (*close)(struct tty_struct * tty, struct file * filp);
+ * @close: ``void ()(struct tty_struct *tty, struct file *)``
  *
- * 	This routine is called when a particular tty device is closed.
- *	Note: called even if the corresponding open() failed.
+ *	This routine is called when a particular @tty device is closed.
+ *
+ *	Remark: called even if the corresponding @open() failed.
  *
  *	Required method. Called with tty lock held.
  *
- * void (*shutdown)(struct tty_struct * tty);
+ * @shutdown: ``void ()(struct tty_struct *tty)``
  *
- * 	This routine is called under the tty lock when a particular tty device
- *	is closed for the last time. It executes before the tty resources
- *	are freed so may execute while another function holds a tty kref.
+ *	This routine is called under the tty lock when a particular @tty device
+ *	is closed for the last time. It executes before the @tty resources
+ *	are freed so may execute while another function holds a @tty kref.
  *
- * void (*cleanup)(struct tty_struct * tty);
+ * @cleanup: ``void ()(struct tty_struct *tty)``
  *
- *	This routine is called asynchronously when a particular tty device
+ *	This routine is called asynchronously when a particular @tty device
  *	is closed for the last time freeing up the resources. This is
  *	actually the second part of shutdown for routines that might sleep.
  *
+ * @write: ``int ()(struct tty_struct *tty, const unsigned char *buf,
+ *		    int count)``
  *
- * int (*write)(struct tty_struct * tty,
- * 		 const unsigned char *buf, int count);
- *
- * 	This routine is called by the kernel to write a series of
- * 	characters to the tty device.  The characters may come from
- * 	user space or kernel space.  This routine will return the
+ *	This routine is called by the kernel to write a series (@count) of
+ *	characters (@buf) to the @tty device. The characters may come from
+ *	user space or kernel space.  This routine will return the
  *	number of characters actually accepted for writing.
  *
  *	Optional: Required for writable devices.
  *
- * int (*put_char)(struct tty_struct *tty, unsigned char ch);
+ * @put_char: ``int ()(struct tty_struct *tty, unsigned char ch)``
  *
- * 	This routine is called by the kernel to write a single
- * 	character to the tty device.  If the kernel uses this routine,
- * 	it must call the flush_chars() routine (if defined) when it is
- * 	done stuffing characters into the driver.  If there is no room
- * 	in the queue, the character is ignored.
+ *	This routine is called by the kernel to write a single character @ch to
+ *	the @tty device. If the kernel uses this routine, it must call the
+ *	@flush_chars() routine (if defined) when it is done stuffing characters
+ *	into the driver. If there is no room in the queue, the character is
+ *	ignored.
  *
- *	Optional: Kernel will use the write method if not provided.
+ *	Optional: Kernel will use the @write method if not provided. Do not
+ *	call this function directly, call tty_put_char().
  *
- *	Note: Do not call this function directly, call tty_put_char
+ * @flush_chars: ``void ()(struct tty_struct *tty)``
  *
- * void (*flush_chars)(struct tty_struct *tty);
+ *	This routine is called by the kernel after it has written a
+ *	series of characters to the tty device using @put_char().
  *
- * 	This routine is called by the kernel after it has written a
- * 	series of characters to the tty device using put_char().  
+ *	Optional. Do not call this function directly, call
+ *	tty_driver_flush_chars().
  *
- *	Optional:
+ * @write_room: ``unsigned int ()(struct tty_struct *tty)``
  *
- *	Note: Do not call this function directly, call tty_driver_flush_chars
- * 
- * unsigned int  (*write_room)(struct tty_struct *tty);
- *
- * 	This routine returns the numbers of characters the tty driver
- * 	will accept for queuing to be written.  This number is subject
- * 	to change as output buffers get emptied, or if the output flow
+ *	This routine returns the numbers of characters the @tty driver
+ *	will accept for queuing to be written.  This number is subject
+ *	to change as output buffers get emptied, or if the output flow
  *	control is acted.
  *
- *	Required if write method is provided else not needed.
+ *	Required if @write method is provided else not needed. Do not call this
+ *	function directly, call tty_write_room()
  *
- *	Note: Do not call this function directly, call tty_write_room
- * 
- * int  (*ioctl)(struct tty_struct *tty, unsigned int cmd, unsigned long arg);
+ * @chars_in_buffer: ``unsigned int ()(struct tty_struct *tty)``
  *
- * 	This routine allows the tty driver to implement
- *	device-specific ioctls.  If the ioctl number passed in cmd
- * 	is not recognized by the driver, it should return ENOIOCTLCMD.
+ *	This routine returns the number of characters in the device private
+ *	output queue. Used in tty_wait_until_sent() and for poll()
+ *	implementation.
  *
- *	Optional
+ *	Optional: if not provided, it is assumed there is no queue on the
+ *	device. Do not call this function directly, call tty_chars_in_buffer().
  *
- * long (*compat_ioctl)(struct tty_struct *tty,,
- * 	                unsigned int cmd, unsigned long arg);
+ * @ioctl: ``int ()(struct tty_struct *tty, unsigned int cmd,
+ *		    unsigned long arg)``
  *
- * 	implement ioctl processing for 32 bit process on 64 bit system
+ *	This routine allows the @tty driver to implement device-specific
+ *	ioctls. If the ioctl number passed in @cmd is not recognized by the
+ *	driver, it should return %ENOIOCTLCMD.
  *
- *	Optional
- * 
- * void (*set_termios)(struct tty_struct *tty, struct ktermios * old);
+ *	Optional.
  *
- * 	This routine allows the tty driver to be notified when
- * 	device's termios settings have changed.
+ * @compat_ioctl: ``long ()(struct tty_struct *tty, unsigned int cmd,
+ *			  unsigned long arg)``
  *
- *	Optional: Called under the termios lock
+ *	Implement ioctl processing for 32 bit process on 64 bit system.
  *
+ *	Optional.
  *
- * void (*set_ldisc)(struct tty_struct *tty);
+ * @set_termios: ``void ()(struct tty_struct *tty, struct ktermios *old)``
  *
- * 	This routine allows the tty driver to be notified when the
- * 	device's termios settings have changed.
+ *	This routine allows the @tty driver to be notified when device's
+ *	termios settings have changed.
  *
- *	Optional: Called under BKL (currently)
- * 
- * void (*throttle)(struct tty_struct * tty);
+ *	Optional: Called under the @tty->termios_rwsem.
  *
- * 	This routine notifies the tty driver that input buffers for
- * 	the line discipline are close to full, and it should somehow
- * 	signal that no more characters should be sent to the tty.
+ * @set_ldisc: ``void ()(struct tty_struct *tty)``
  *
- *	Optional: Always invoke via tty_throttle_safe(), called under the
- *	termios lock.
- * 
- * void (*unthrottle)(struct tty_struct * tty);
+ *	This routine allows the @tty driver to be notified when the device's
+ *	line discipline is being changed.
  *
- * 	This routine notifies the tty drivers that it should signals
- * 	that characters can now be sent to the tty without fear of
- * 	overrunning the input buffers of the line disciplines.
- * 
- *	Optional: Always invoke via tty_unthrottle(), called under the
- *	termios lock.
+ *	Optional. Called under the @tty->ldisc_sem and @tty->termios_rwsem.
  *
- * void (*stop)(struct tty_struct *tty);
+ * @throttle: ``void ()(struct tty_struct *tty)``
  *
- * 	This routine notifies the tty driver that it should stop
- * 	outputting characters to the tty device.  
+ *	This routine notifies the @tty driver that input buffers for the line
+ *	discipline are close to full, and it should somehow signal that no more
+ *	characters should be sent to the @tty.
  *
- *	Called with ->flow.lock held. Serialized with start() method.
+ *	Optional: Always invoke via tty_throttle_safe(). Called under the
+ *	@tty->termios_rwsem.
  *
- *	Optional:
+ * @unthrottle: ``void ()(struct tty_struct *tty)``
  *
- *	Note: Call stop_tty not this method.
- * 
- * void (*start)(struct tty_struct *tty);
+ *	This routine notifies the @tty driver that it should signal that
+ *	characters can now be sent to the @tty without fear of overrunning the
+ *	input buffers of the line disciplines.
  *
- * 	This routine notifies the tty driver that it resume sending
+ *	Optional. Always invoke via tty_unthrottle(). Called under the
+ *	@tty->termios_rwsem.
+ *
+ * @stop: ``void ()(struct tty_struct *tty)``
+ *
+ *	This routine notifies the @tty driver that it should stop outputting
  *	characters to the tty device.
  *
- *	Called with ->flow.lock held. Serialized with stop() method.
+ *	Called with @tty->flow.lock held. Serialized with @start() method.
  *
- *	Optional:
+ *	Optional. Always invoke via stop_tty().
  *
- *	Note: Call start_tty not this method.
- * 
- * void (*hangup)(struct tty_struct *tty);
+ * @start: ``void ()(struct tty_struct *tty)``
  *
- * 	This routine notifies the tty driver that it should hang up the
- * 	tty device.
+ *	This routine notifies the @tty driver that it resumed sending
+ *	characters to the @tty device.
  *
- *	Optional:
+ *	Called with @tty->flow.lock held. Serialized with stop() method.
  *
- *	Called with tty lock held.
+ *	Optional. Always invoke via start_tty().
  *
- * int (*break_ctl)(struct tty_struct *tty, int state);
+ * @hangup: ``void ()(struct tty_struct *tty)``
  *
- * 	This optional routine requests the tty driver to turn on or
- * 	off BREAK status on the RS-232 port.  If state is -1,
- * 	then the BREAK status should be turned on; if state is 0, then
- * 	BREAK should be turned off.
+ *	This routine notifies the @tty driver that it should hang up the @tty
+ *	device.
  *
- * 	If this routine is implemented, the high-level tty driver will
- * 	handle the following ioctls: TCSBRK, TCSBRKP, TIOCSBRK,
- * 	TIOCCBRK.
+ *	Optional. Called with tty lock held.
  *
- *	If the driver sets TTY_DRIVER_HARDWARE_BREAK then the interface
- *	will also be called with actual times and the hardware is expected
- *	to do the delay work itself. 0 and -1 are still used for on/off.
+ * @break_ctl: ``int ()(struct tty_struct *tty, int state)``
  *
- *	Optional: Required for TCSBRK/BRKP/etc handling.
+ *	This optional routine requests the @tty driver to turn on or off BREAK
+ *	status on the RS-232 port. If @state is -1, then the BREAK status
+ *	should be turned on; if @state is 0, then BREAK should be turned off.
  *
- * void (*wait_until_sent)(struct tty_struct *tty, int timeout);
- * 
- * 	This routine waits until the device has written out all of the
- * 	characters in its transmitter FIFO.
+ *	If this routine is implemented, the high-level tty driver will handle
+ *	the following ioctls: %TCSBRK, %TCSBRKP, %TIOCSBRK, %TIOCCBRK.
+ *
+ *	If the driver sets %TTY_DRIVER_HARDWARE_BREAK in tty_alloc_driver(),
+ *	then the interface will also be called with actual times and the
+ *	hardware is expected to do the delay work itself. 0 and -1 are still
+ *	used for on/off.
+ *
+ *	Optional: Required for %TCSBRK/%BRKP/etc. handling.
+ *
+ * @flush_buffer: ``void ()(struct tty_struct *tty)``
+ *
+ *	This routine discards device private output buffer. Invoked on close,
+ *	hangup, to implement %TCOFLUSH ioctl and similar.
+ *
+ *	Optional: if not provided, it is assumed there is no queue on the
+ *	device. Do not call this function directly, call
+ *	tty_driver_flush_buffer().
+ *
+ * @wait_until_sent: ``void ()(struct tty_struct *tty, int timeout)``
+ *
+ *	This routine waits until the device has written out all of the
+ *	characters in its transmitter FIFO. Or until @timeout (in jiffies) is
+ *	reached.
+ *
+ *	Optional: If not provided, the device is assumed to have no FIFO.
+ *	Usually correct to invoke via tty_wait_until_sent().
+ *
+ * @send_xchar: ``void ()(struct tty_struct *tty, char ch)``
+ *
+ *	This routine is used to send a high-priority XON/XOFF character (@ch)
+ *	to the @tty device.
  *
- *	Optional: If not provided the device is assumed to have no FIFO
+ *	Optional: If not provided, then the @write method is called under
+ *	the @tty->atomic_write_lock to keep it serialized with the ldisc.
  *
- *	Note: Usually correct to call tty_wait_until_sent
+ * @tiocmget: ``int ()(struct tty_struct *tty)``
  *
- * void (*send_xchar)(struct tty_struct *tty, char ch);
+ *	This routine is used to obtain the modem status bits from the @tty
+ *	driver.
  *
- * 	This routine is used to send a high-priority XON/XOFF
- * 	character to the device.
+ *	Optional: If not provided, then %ENOTTY is returned from the %TIOCMGET
+ *	ioctl. Do not call this function directly, call tty_tiocmget().
  *
- *	Optional: If not provided then the write method is called under
- *	the atomic write lock to keep it serialized with the ldisc.
+ * @tiocmset: ``int ()(struct tty_struct *tty,
+ *		       unsigned int set, unsigned int clear)``
  *
- * int (*resize)(struct tty_struct *tty, struct winsize *ws)
+ *	This routine is used to set the modem status bits to the @tty driver.
+ *	First, @clear bits should be cleared, then @set bits set.
  *
- *	Called when a termios request is issued which changes the
- *	requested terminal geometry.
+ *	Optional: If not provided, then %ENOTTY is returned from the %TIOCMSET
+ *	ioctl. Do not call this function directly, call tty_tiocmset().
+ *
+ * @resize: ``int ()(struct tty_struct *tty, struct winsize *ws)``
+ *
+ *	Called when a termios request is issued which changes the requested
+ *	terminal geometry to @ws.
  *
  *	Optional: the default action is to update the termios structure
  *	without error. This is usually the correct behaviour. Drivers should
- *	not force errors here if they are not resizable objects (eg a serial
+ *	not force errors here if they are not resizable objects (e.g. a serial
  *	line). See tty_do_resize() if you need to wrap the standard method
- *	in your own logic - the usual case.
+ *	in your own logic -- the usual case.
+ *
+ * @get_icount: ``int ()(struct tty_struct *tty,
+ *			 struct serial_icounter *icount)``
+ *
+ *	Called when the @tty device receives a %TIOCGICOUNT ioctl. Passed a
+ *	kernel structure @icount to complete.
+ *
+ *	Optional: called only if provided, otherwise %ENOTTY will be returned.
+ *
+ * @get_serial: ``int ()(struct tty_struct *tty, struct serial_struct *p)``
+ *
+ *	Called when the @tty device receives a %TIOCGSERIAL ioctl. Passed a
+ *	kernel structure @p (&struct serial_struct) to complete.
+ *
+ *	Optional: called only if provided, otherwise %ENOTTY will be returned.
+ *	Do not call this function directly, call tty_tiocgserial().
+ *
+ * @set_serial: ``int ()(struct tty_struct *tty, struct serial_struct *p)``
+ *
+ *	Called when the @tty device receives a %TIOCSSERIAL ioctl. Passed a
+ *	kernel structure @p (&struct serial_struct) to set the values from.
+ *
+ *	Optional: called only if provided, otherwise %ENOTTY will be returned.
+ *	Do not call this function directly, call tty_tiocsserial().
+ *
+ * @show_fdinfo: ``void ()(struct tty_struct *tty, struct seq_file *m)``
+ *
+ *	Called when the @tty device file descriptor receives a fdinfo request
+ *	from VFS (to show in /proc/<pid>/fdinfo/). @m should be filled with
+ *	information.
+ *
+ *	Optional: called only if provided, otherwise nothing is written to @m.
+ *	Do not call this function directly, call tty_show_fdinfo().
+ *
+ * @poll_init: ``int ()(struct tty_driver *driver, int line, char *options)``
+ *
+ *	kgdboc support (Documentation/dev-tools/kgdb.rst). This routine is
+ *	called to initialize the HW for later use by calling @poll_get_char or
+ *	@poll_put_char.
+ *
+ *	Optional: called only if provided, otherwise skipped as a non-polling
+ *	driver.
+ *
+ * @poll_get_char: ``int ()(struct tty_driver *driver, int line)``
+ *
+ *	kgdboc support (see @poll_init). @driver should read a character from a
+ *	tty identified by @line and return it.
+ *
+ *	Optional: called only if @poll_init provided.
  *
- * int (*get_icount)(struct tty_struct *tty, struct serial_icounter *icount);
+ * @poll_put_char: ``void ()(struct tty_driver *driver, int line, char ch)``
  *
- *	Called when the device receives a TIOCGICOUNT ioctl. Passed a kernel
- *	structure to complete. This method is optional and will only be called
- *	if provided (otherwise ENOTTY will be returned).
+ *	kgdboc support (see @poll_init). @driver should write character @ch to
+ *	a tty identified by @line.
+ *
+ *	Optional: called only if @poll_init provided.
+ *
+ * @proc_show: ``int ()(struct seq_file *m, void *driver)``
+ *
+ *	Driver @driver (cast to &struct tty_driver) can show additional info in
+ *	/proc/tty/driver/<driver_name>. It is enough to fill in the information
+ *	into @m.
+ *
+ *	Optional: called only if provided, otherwise no /proc entry created.
+ *
+ * This structure defines the interface between the low-level tty driver and
+ * the tty routines. These routines can be defined. Unless noted otherwise,
+ * they are optional, and can be filled in with a %NULL pointer.
  */
-
-#include <linux/export.h>
-#include <linux/fs.h>
-#include <linux/kref.h>
-#include <linux/list.h>
-#include <linux/cdev.h>
-#include <linux/termios.h>
-#include <linux/seq_file.h>
-
-struct tty_struct;
-struct tty_driver;
-struct serial_icounter_struct;
-struct serial_struct;
-
 struct tty_operations {
 	struct tty_struct * (*lookup)(struct tty_driver *driver,
 			struct file *filp, int idx);
@@ -288,7 +372,7 @@ struct tty_operations {
 	int (*poll_get_char)(struct tty_driver *driver, int line);
 	void (*poll_put_char)(struct tty_driver *driver, int line, char ch);
 #endif
-	int (*proc_show)(struct seq_file *, void *);
+	int (*proc_show)(struct seq_file *m, void *driver);
 } __randomize_layout;
 
 /**
-- 
cgit v1.2.3


From 630bf86d15778fd3e5df17cb6e00839d0f44a707 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Fri, 26 Nov 2021 09:15:53 +0100
Subject: tty: add kernel-doc for tty_port_operations

tty_port_operations used to have only comments along its members.
Convert them into proper kernel-doc comments in front of the structure.
And add some more explanation to them where needed.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20211126081611.11001-6-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/tty_port.h | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tty_port.h b/include/linux/tty_port.h
index 9091e1c8de4c..d3ea9ed0b98e 100644
--- a/include/linux/tty_port.h
+++ b/include/linux/tty_port.h
@@ -12,21 +12,28 @@ struct tty_driver;
 struct tty_port;
 struct tty_struct;
 
+/**
+ * struct tty_port_operations -- operations on tty_port
+ * @carrier_raised: return 1 if the carrier is raised on @port
+ * @dtr_rts: raise the DTR line if @raise is nonzero, otherwise lower DTR
+ * @shutdown: called when the last close completes or a hangup finishes IFF the
+ *	port was initialized. Do not use to free resources. Turn off the device
+ *	only. Called under the port mutex to serialize against @activate and
+ *	@shutdown.
+ * @activate: called under the port mutex from tty_port_open(), serialized using
+ *	the port mutex. Supposed to turn on the device.
+ *
+ *	FIXME: long term getting the tty argument *out* of this would be good
+ *	for consoles.
+ *
+ * @destruct: called on the final put of a port. Free resources, possibly incl.
+ *	the port itself.
+ */
 struct tty_port_operations {
-	/* Return 1 if the carrier is raised */
 	int (*carrier_raised)(struct tty_port *port);
-	/* Control the DTR line */
 	void (*dtr_rts)(struct tty_port *port, int raise);
-	/* Called when the last close completes or a hangup finishes
-	   IFF the port was initialized. Do not use to free resources. Called
-	   under the port mutex to serialize against activate/shutdowns */
 	void (*shutdown)(struct tty_port *port);
-	/* Called under the port mutex from tty_port_open, serialized using
-	   the port mutex */
-        /* FIXME: long term getting the tty argument *out* of this would be
-           good for consoles */
 	int (*activate)(struct tty_port *port, struct tty_struct *tty);
-	/* Called on the final put of a port */
 	void (*destruct)(struct tty_port *port);
 };
 
-- 
cgit v1.2.3


From 0c6119f9f7dc03a53bd35ca5a77926eef3c33d10 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Fri, 26 Nov 2021 09:15:54 +0100
Subject: tty: add kernel-doc for tty_ldisc_ops

tty_ldisc_ops structure was already partially documented in a standalone
comment in the header beginning.

Move it right before the structure and reformat it so it complies to
kernel-doc. That way, we can include it in Documentation/ later in this
series.

And add the documentation for the members where missing too.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20211126081611.11001-7-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/tty_ldisc.h | 259 +++++++++++++++++++++++-----------------------
 1 file changed, 130 insertions(+), 129 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h
index 25f07017bbad..e0da0ba02de9 100644
--- a/include/linux/tty_ldisc.h
+++ b/include/linux/tty_ldisc.h
@@ -4,126 +4,6 @@
 
 struct tty_struct;
 
-/*
- * This structure defines the interface between the tty line discipline
- * implementation and the tty routines.  The following routines can be
- * defined; unless noted otherwise, they are optional, and can be
- * filled in with a null pointer.
- *
- * int	(*open)(struct tty_struct *);
- *
- *	This function is called when the line discipline is associated
- *	with the tty.  The line discipline can use this as an
- *	opportunity to initialize any state needed by the ldisc routines.
- *
- * void	(*close)(struct tty_struct *);
- *
- *	This function is called when the line discipline is being
- *	shutdown, either because the tty is being closed or because
- *	the tty is being changed to use a new line discipline
- *
- * void	(*flush_buffer)(struct tty_struct *tty);
- *
- *	This function instructs the line discipline to clear its
- *	buffers of any input characters it may have queued to be
- *	delivered to the user mode process.
- *
- * ssize_t (*read)(struct tty_struct * tty, struct file * file,
- *		   unsigned char * buf, size_t nr);
- *
- *	This function is called when the user requests to read from
- *	the tty.  The line discipline will return whatever characters
- *	it has buffered up for the user.  If this function is not
- *	defined, the user will receive an EIO error.
- *
- * ssize_t (*write)(struct tty_struct * tty, struct file * file,
- *		    const unsigned char * buf, size_t nr);
- *
- *	This function is called when the user requests to write to the
- *	tty.  The line discipline will deliver the characters to the
- *	low-level tty device for transmission, optionally performing
- *	some processing on the characters first.  If this function is
- *	not defined, the user will receive an EIO error.
- *
- * int	(*ioctl)(struct tty_struct *tty, unsigned int cmd, unsigned long arg);
- *
- *	This function is called when the user requests an ioctl which
- *	is not handled by the tty layer or the low-level tty driver.
- *	It is intended for ioctls which affect line discpline
- *	operation.  Note that the search order for ioctls is (1) tty
- *	layer, (2) tty low-level driver, (3) line discpline.  So a
- *	low-level driver can "grab" an ioctl request before the line
- *	discpline has a chance to see it.
- *
- * int	(*compat_ioctl)(struct tty_struct *tty, unsigned int cmd,
- *			unsigned long arg);
- *
- *	Process ioctl calls from 32-bit process on 64-bit system
- *
- *	NOTE: only ioctls that are neither "pointer to compatible
- *	structure" nor tty-generic.  Something private that takes
- *	an integer or a pointer to wordsize-sensitive structure
- *	belongs here, but most of ldiscs will happily leave
- *	it NULL.
- *
- * void	(*set_termios)(struct tty_struct *tty, struct ktermios * old);
- *
- *	This function notifies the line discpline that a change has
- *	been made to the termios structure.
- *
- * int	(*poll)(struct tty_struct * tty, struct file * file,
- *		  poll_table *wait);
- *
- *	This function is called when a user attempts to select/poll on a
- *	tty device.  It is solely the responsibility of the line
- *	discipline to handle poll requests.
- *
- * void	(*receive_buf)(struct tty_struct *, const unsigned char *cp,
- *		       char *fp, int count);
- *
- *	This function is called by the low-level tty driver to send
- *	characters received by the hardware to the line discpline for
- *	processing.  <cp> is a pointer to the buffer of input
- *	character received by the device.  <fp> is a pointer to a
- *	pointer of flag bytes which indicate whether a character was
- *	received with a parity error, etc. <fp> may be NULL to indicate
- *	all data received is TTY_NORMAL.
- *
- * void	(*write_wakeup)(struct tty_struct *);
- *
- *	This function is called by the low-level tty driver to signal
- *	that line discpline should try to send more characters to the
- *	low-level driver for transmission.  If the line discpline does
- *	not have any more data to send, it can just return. If the line
- *	discipline does have some data to send, please arise a tasklet
- *	or workqueue to do the real data transfer. Do not send data in
- *	this hook, it may leads to a deadlock.
- *
- * int (*hangup)(struct tty_struct *)
- *
- *	Called on a hangup. Tells the discipline that it should
- *	cease I/O to the tty driver. Can sleep. The driver should
- *	seek to perform this action quickly but should wait until
- *	any pending driver I/O is completed.
- *
- * void (*dcd_change)(struct tty_struct *tty, unsigned int status)
- *
- *	Tells the discipline that the DCD pin has changed its status.
- *	Used exclusively by the N_PPS (Pulse-Per-Second) line discipline.
- *
- * int	(*receive_buf2)(struct tty_struct *, const unsigned char *cp,
- *			char *fp, int count);
- *
- *	This function is called by the low-level tty driver to send
- *	characters received by the hardware to the line discpline for
- *	processing.  <cp> is a pointer to the buffer of input
- *	character received by the device.  <fp> is a pointer to a
- *	pointer of flag bytes which indicate whether a character was
- *	received with a parity error, etc. <fp> may be NULL to indicate
- *	all data received is TTY_NORMAL.
- *	If assigned, prefer this function for automatic flow control.
- */
-
 #include <linux/fs.h>
 #include <linux/wait.h>
 #include <linux/atomic.h>
@@ -175,7 +55,128 @@ int ldsem_down_write_nested(struct ld_semaphore *sem, int subclass,
 		ldsem_down_write(sem, timeout)
 #endif
 
-
+/**
+ * struct tty_ldisc_ops - ldisc operations
+ *
+ * @name: name of this ldisc rendered in /proc/tty/ldiscs
+ * @num: ``N_*`` number (%N_TTY, %N_HDLC, ...) reserved to this ldisc
+ *
+ * @open: ``int ()(struct tty_struct *tty)``
+ *
+ *	This function is called when the line discipline is associated with the
+ *	@tty. The line discipline can use this as an opportunity to initialize
+ *	any state needed by the ldisc routines.
+ *
+ * @close: ``void ()(struct tty_struct *tty)``
+ *
+ *	This function is called when the line discipline is being shutdown,
+ *	either because the @tty is being closed or because the @tty is being
+ *	changed to use a new line discipline
+ *
+ * @flush_buffer: ``void ()(struct tty_struct *tty)``
+ *
+ *	This function instructs the line discipline to clear its buffers of any
+ *	input characters it may have queued to be delivered to the user mode
+ *	process.
+ *
+ * @read: ``ssize_t ()(struct tty_struct *tty, struct file *file,
+ *		unsigned char *buf, size_t nr)``
+ *
+ *	This function is called when the user requests to read from the @tty.
+ *	The line discipline will return whatever characters it has buffered up
+ *	for the user. If this function is not defined, the user will receive
+ *	an %EIO error.
+ *
+ * @write: ``ssize_t ()(struct tty_struct *tty, struct file *file,
+ *		const unsigned char *buf, size_t nr)``
+ *
+ *	This function is called when the user requests to write to the @tty.
+ *	The line discipline will deliver the characters to the low-level tty
+ *	device for transmission, optionally performing some processing on the
+ *	characters first. If this function is not defined, the user will
+ *	receive an %EIO error.
+ *
+ * @ioctl: ``int ()(struct tty_struct *tty, unsigned int cmd,
+ *		unsigned long arg)``
+ *
+ *	This function is called when the user requests an ioctl which is not
+ *	handled by the tty layer or the low-level tty driver. It is intended
+ *	for ioctls which affect line discpline operation.  Note that the search
+ *	order for ioctls is (1) tty layer, (2) tty low-level driver, (3) line
+ *	discpline. So a low-level driver can "grab" an ioctl request before
+ *	the line discpline has a chance to see it.
+ *
+ * @compat_ioctl: ``int ()(struct tty_struct *tty, unsigned int cmd,
+ *		unsigned long arg)``
+ *
+ *	Process ioctl calls from 32-bit process on 64-bit system.
+ *
+ *	Note that only ioctls that are neither "pointer to compatible
+ *	structure" nor tty-generic.  Something private that takes an integer or
+ *	a pointer to wordsize-sensitive structure belongs here, but most of
+ *	ldiscs will happily leave it %NULL.
+ *
+ * @set_termios: ``void ()(struct tty_struct *tty, struct ktermios *old)``
+ *
+ *	This function notifies the line discpline that a change has been made
+ *	to the termios structure.
+ *
+ * @poll: ``int ()(struct tty_struct *tty, struct file *file,
+ *		  struct poll_table_struct *wait)``
+ *
+ *	This function is called when a user attempts to select/poll on a @tty
+ *	device. It is solely the responsibility of the line discipline to
+ *	handle poll requests.
+ *
+ * @hangup: ``void ()(struct tty_struct *tty)``
+ *
+ *	Called on a hangup. Tells the discipline that it should cease I/O to
+ *	the tty driver. Can sleep. The driver should seek to perform this
+ *	action quickly but should wait until any pending driver I/O is
+ *	completed.
+ *
+ * @receive_buf: ``void ()(struct tty_struct *tty, const unsigned char *cp,
+ *		       const char *fp, int count)``
+ *
+ *	This function is called by the low-level tty driver to send characters
+ *	received by the hardware to the line discpline for processing. @cp is
+ *	a pointer to the buffer of input character received by the device. @fp
+ *	is a pointer to an array of flag bytes which indicate whether a
+ *	character was received with a parity error, etc. @fp may be %NULL to
+ *	indicate all data received is %TTY_NORMAL.
+ *
+ * @write_wakeup: ``void ()(struct tty_struct *tty)``
+ *
+ *	This function is called by the low-level tty driver to signal that line
+ *	discpline should try to send more characters to the low-level driver
+ *	for transmission. If the line discpline does not have any more data to
+ *	send, it can just return. If the line discipline does have some data to
+ *	send, please arise a tasklet or workqueue to do the real data transfer.
+ *	Do not send data in this hook, it may lead to a deadlock.
+ *
+ * @dcd_change: ``void ()(struct tty_struct *tty, unsigned int status)``
+ *
+ *	Tells the discipline that the DCD pin has changed its status. Used
+ *	exclusively by the %N_PPS (Pulse-Per-Second) line discipline.
+ *
+ * @receive_buf2: ``int ()(struct tty_struct *tty, const unsigned char *cp,
+ *			const char *fp, int count)``
+ *
+ *	This function is called by the low-level tty driver to send characters
+ *	received by the hardware to the line discpline for processing. @cp is a
+ *	pointer to the buffer of input character received by the device.  @fp
+ *	is a pointer to an array of flag bytes which indicate whether a
+ *	character was received with a parity error, etc. @fp may be %NULL to
+ *	indicate all data received is %TTY_NORMAL. If assigned, prefer this
+ *	function for automatic flow control.
+ *
+ * @owner: module containting this ldisc (for reference counting)
+ *
+ * This structure defines the interface between the tty line discipline
+ * implementation and the tty routines. The above routines can be defined.
+ * Unless noted otherwise, they are optional, and can be filled in with a %NULL
+ * pointer.
+ */
 struct tty_ldisc_ops {
 	char	*name;
 	int	num;
@@ -183,8 +184,8 @@ struct tty_ldisc_ops {
 	/*
 	 * The following routines are called from above.
 	 */
-	int	(*open)(struct tty_struct *);
-	void	(*close)(struct tty_struct *);
+	int	(*open)(struct tty_struct *tty);
+	void	(*close)(struct tty_struct *tty);
 	void	(*flush_buffer)(struct tty_struct *tty);
 	ssize_t	(*read)(struct tty_struct *tty, struct file *file,
 			unsigned char *buf, size_t nr,
@@ -196,18 +197,18 @@ struct tty_ldisc_ops {
 	int	(*compat_ioctl)(struct tty_struct *tty, unsigned int cmd,
 			unsigned long arg);
 	void	(*set_termios)(struct tty_struct *tty, struct ktermios *old);
-	__poll_t (*poll)(struct tty_struct *, struct file *,
-			     struct poll_table_struct *);
+	__poll_t (*poll)(struct tty_struct *tty, struct file *file,
+			     struct poll_table_struct *wait);
 	void	(*hangup)(struct tty_struct *tty);
 
 	/*
 	 * The following routines are called from below.
 	 */
-	void	(*receive_buf)(struct tty_struct *, const unsigned char *cp,
+	void	(*receive_buf)(struct tty_struct *tty, const unsigned char *cp,
 			       const char *fp, int count);
-	void	(*write_wakeup)(struct tty_struct *);
-	void	(*dcd_change)(struct tty_struct *, unsigned int);
-	int	(*receive_buf2)(struct tty_struct *, const unsigned char *cp,
+	void	(*write_wakeup)(struct tty_struct *tty);
+	void	(*dcd_change)(struct tty_struct *tty, unsigned int status);
+	int	(*receive_buf2)(struct tty_struct *tty, const unsigned char *cp,
 				const char *fp, int count);
 
 	struct  module *owner;
-- 
cgit v1.2.3


From 29d5ef685948369602ccd5c04d2a215449c4b943 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Fri, 26 Nov 2021 09:15:55 +0100
Subject: tty: combine tty_operations triple docs into kernel-doc

In Documentation/driver-api/serial/tty.rst, there are triplicated texts
about some struct tty_operations' hooks. Combine them into existing
kernel-doc comments of struct tty_operations and drop them from the
Documentation/.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20211126081611.11001-8-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/tty_driver.h | 36 +++++++++++++++++++++++++++---------
 1 file changed, 27 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h
index 5611992ab26a..41274d551e28 100644
--- a/include/linux/tty_driver.h
+++ b/include/linux/tty_driver.h
@@ -47,15 +47,17 @@ struct serial_struct;
  *	routine is mandatory; if this routine is not filled in, the attempted
  *	open will fail with %ENODEV.
  *
- *	Required method. Called with tty lock held.
+ *	Required method. Called with tty lock held. May sleep.
  *
  * @close: ``void ()(struct tty_struct *tty, struct file *)``
  *
- *	This routine is called when a particular @tty device is closed.
+ *	This routine is called when a particular @tty device is closed. At the
+ *	point of return from this call the driver must make no further ldisc
+ *	calls of any kind.
  *
  *	Remark: called even if the corresponding @open() failed.
  *
- *	Required method. Called with tty lock held.
+ *	Required method. Called with tty lock held. May sleep.
  *
  * @shutdown: ``void ()(struct tty_struct *tty)``
  *
@@ -77,7 +79,10 @@ struct serial_struct;
  *	user space or kernel space.  This routine will return the
  *	number of characters actually accepted for writing.
  *
- *	Optional: Required for writable devices.
+ *	May occur in parallel in special cases. Because this includes panic
+ *	paths drivers generally shouldn't try and do clever locking here.
+ *
+ *	Optional: Required for writable devices. May not sleep.
  *
  * @put_char: ``int ()(struct tty_struct *tty, unsigned char ch)``
  *
@@ -105,6 +110,9 @@ struct serial_struct;
  *	to change as output buffers get emptied, or if the output flow
  *	control is acted.
  *
+ *	The ldisc is responsible for being intelligent about multi-threading of
+ *	write_room/write calls
+ *
  *	Required if @write method is provided else not needed. Do not call this
  *	function directly, call tty_write_room()
  *
@@ -136,14 +144,21 @@ struct serial_struct;
  * @set_termios: ``void ()(struct tty_struct *tty, struct ktermios *old)``
  *
  *	This routine allows the @tty driver to be notified when device's
- *	termios settings have changed.
+ *	termios settings have changed. New settings are in @tty->termios.
+ *	Previous settings are passed in the @old argument.
  *
- *	Optional: Called under the @tty->termios_rwsem.
+ *	The API is defined such that the driver should return the actual modes
+ *	selected. This means that the driver is responsible for modifying any
+ *	bits in @tty->termios it cannot fulfill to indicate the actual modes
+ *	being used.
+ *
+ *	Optional. Called under the @tty->termios_rwsem. May sleep.
  *
  * @set_ldisc: ``void ()(struct tty_struct *tty)``
  *
  *	This routine allows the @tty driver to be notified when the device's
- *	line discipline is being changed.
+ *	line discipline is being changed. At the point this is done the
+ *	discipline is not yet usable.
  *
  *	Optional. Called under the @tty->ldisc_sem and @tty->termios_rwsem.
  *
@@ -153,6 +168,9 @@ struct serial_struct;
  *	discipline are close to full, and it should somehow signal that no more
  *	characters should be sent to the @tty.
  *
+ *	Serialization including with @unthrottle() is the job of the ldisc
+ *	layer.
+ *
  *	Optional: Always invoke via tty_throttle_safe(). Called under the
  *	@tty->termios_rwsem.
  *
@@ -204,7 +222,7 @@ struct serial_struct;
  *	hardware is expected to do the delay work itself. 0 and -1 are still
  *	used for on/off.
  *
- *	Optional: Required for %TCSBRK/%BRKP/etc. handling.
+ *	Optional: Required for %TCSBRK/%BRKP/etc. handling. May sleep.
  *
  * @flush_buffer: ``void ()(struct tty_struct *tty)``
  *
@@ -222,7 +240,7 @@ struct serial_struct;
  *	reached.
  *
  *	Optional: If not provided, the device is assumed to have no FIFO.
- *	Usually correct to invoke via tty_wait_until_sent().
+ *	Usually correct to invoke via tty_wait_until_sent(). May sleep.
  *
  * @send_xchar: ``void ()(struct tty_struct *tty, char ch)``
  *
-- 
cgit v1.2.3


From 40f4268cddb93d17a11579920d940c2dca8b9445 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Fri, 26 Nov 2021 09:15:56 +0100
Subject: tty: combine tty_ldisc_ops docs into kernel-doc

In Documentation/driver-api/serial/tty.rst, there are duplicated texts
about some struct tty_ldisc_ops' hooks. Combine them into existing
kernel-doc comments of struct tty_ldisc_ops and drop them from the
Documentation/.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20211126081611.11001-9-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/tty_ldisc.h | 67 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 43 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h
index e0da0ba02de9..e85002b56752 100644
--- a/include/linux/tty_ldisc.h
+++ b/include/linux/tty_ldisc.h
@@ -61,33 +61,45 @@ int ldsem_down_write_nested(struct ld_semaphore *sem, int subclass,
  * @name: name of this ldisc rendered in /proc/tty/ldiscs
  * @num: ``N_*`` number (%N_TTY, %N_HDLC, ...) reserved to this ldisc
  *
- * @open: ``int ()(struct tty_struct *tty)``
+ * @open: [TTY] ``int ()(struct tty_struct *tty)``
  *
  *	This function is called when the line discipline is associated with the
- *	@tty. The line discipline can use this as an opportunity to initialize
- *	any state needed by the ldisc routines.
+ *	@tty. No other call into the line discipline for this tty will occur
+ *	until it completes successfully. It should initialize any state needed
+ *	by the ldisc, and set @tty->receive_room to the maximum amount of data
+ *	the line discipline is willing to accept from the driver with a single
+ *	call to @receive_buf(). Returning an error will prevent the ldisc from
+ *	being attached.
  *
- * @close: ``void ()(struct tty_struct *tty)``
+ *	Can sleep.
+ *
+ * @close: [TTY] ``void ()(struct tty_struct *tty)``
  *
  *	This function is called when the line discipline is being shutdown,
  *	either because the @tty is being closed or because the @tty is being
- *	changed to use a new line discipline
+ *	changed to use a new line discipline. At the point of execution no
+ *	further users will enter the ldisc code for this tty.
+ *
+ *	Can sleep.
  *
- * @flush_buffer: ``void ()(struct tty_struct *tty)``
+ * @flush_buffer: [TTY] ``void ()(struct tty_struct *tty)``
  *
  *	This function instructs the line discipline to clear its buffers of any
  *	input characters it may have queued to be delivered to the user mode
- *	process.
+ *	process. It may be called at any point between open and close.
  *
- * @read: ``ssize_t ()(struct tty_struct *tty, struct file *file,
+ * @read: [TTY] ``ssize_t ()(struct tty_struct *tty, struct file *file,
  *		unsigned char *buf, size_t nr)``
  *
  *	This function is called when the user requests to read from the @tty.
  *	The line discipline will return whatever characters it has buffered up
  *	for the user. If this function is not defined, the user will receive
- *	an %EIO error.
+ *	an %EIO error. Multiple read calls may occur in parallel and the ldisc
+ *	must deal with serialization issues.
+ *
+ *	Can sleep.
  *
- * @write: ``ssize_t ()(struct tty_struct *tty, struct file *file,
+ * @write: [TTY] ``ssize_t ()(struct tty_struct *tty, struct file *file,
  *		const unsigned char *buf, size_t nr)``
  *
  *	This function is called when the user requests to write to the @tty.
@@ -96,7 +108,9 @@ int ldsem_down_write_nested(struct ld_semaphore *sem, int subclass,
  *	characters first. If this function is not defined, the user will
  *	receive an %EIO error.
  *
- * @ioctl: ``int ()(struct tty_struct *tty, unsigned int cmd,
+ *	Can sleep.
+ *
+ * @ioctl: [TTY] ``int ()(struct tty_struct *tty, unsigned int cmd,
  *		unsigned long arg)``
  *
  *	This function is called when the user requests an ioctl which is not
@@ -106,7 +120,7 @@ int ldsem_down_write_nested(struct ld_semaphore *sem, int subclass,
  *	discpline. So a low-level driver can "grab" an ioctl request before
  *	the line discpline has a chance to see it.
  *
- * @compat_ioctl: ``int ()(struct tty_struct *tty, unsigned int cmd,
+ * @compat_ioctl: [TTY] ``int ()(struct tty_struct *tty, unsigned int cmd,
  *		unsigned long arg)``
  *
  *	Process ioctl calls from 32-bit process on 64-bit system.
@@ -116,27 +130,29 @@ int ldsem_down_write_nested(struct ld_semaphore *sem, int subclass,
  *	a pointer to wordsize-sensitive structure belongs here, but most of
  *	ldiscs will happily leave it %NULL.
  *
- * @set_termios: ``void ()(struct tty_struct *tty, struct ktermios *old)``
+ * @set_termios: [TTY] ``void ()(struct tty_struct *tty, struct ktermios *old)``
  *
  *	This function notifies the line discpline that a change has been made
  *	to the termios structure.
  *
- * @poll: ``int ()(struct tty_struct *tty, struct file *file,
+ * @poll: [TTY] ``int ()(struct tty_struct *tty, struct file *file,
  *		  struct poll_table_struct *wait)``
  *
  *	This function is called when a user attempts to select/poll on a @tty
  *	device. It is solely the responsibility of the line discipline to
  *	handle poll requests.
  *
- * @hangup: ``void ()(struct tty_struct *tty)``
+ * @hangup: [TTY] ``void ()(struct tty_struct *tty)``
  *
  *	Called on a hangup. Tells the discipline that it should cease I/O to
- *	the tty driver. Can sleep. The driver should seek to perform this
- *	action quickly but should wait until any pending driver I/O is
- *	completed.
+ *	the tty driver. The driver should seek to perform this action quickly
+ *	but should wait until any pending driver I/O is completed. No further
+ *	calls into the ldisc code will occur.
+ *
+ *	Can sleep.
  *
- * @receive_buf: ``void ()(struct tty_struct *tty, const unsigned char *cp,
- *		       const char *fp, int count)``
+ * @receive_buf: [DRV] ``void ()(struct tty_struct *tty,
+ *		       const unsigned char *cp, const char *fp, int count)``
  *
  *	This function is called by the low-level tty driver to send characters
  *	received by the hardware to the line discpline for processing. @cp is
@@ -145,7 +161,7 @@ int ldsem_down_write_nested(struct ld_semaphore *sem, int subclass,
  *	character was received with a parity error, etc. @fp may be %NULL to
  *	indicate all data received is %TTY_NORMAL.
  *
- * @write_wakeup: ``void ()(struct tty_struct *tty)``
+ * @write_wakeup: [DRV] ``void ()(struct tty_struct *tty)``
  *
  *	This function is called by the low-level tty driver to signal that line
  *	discpline should try to send more characters to the low-level driver
@@ -154,13 +170,13 @@ int ldsem_down_write_nested(struct ld_semaphore *sem, int subclass,
  *	send, please arise a tasklet or workqueue to do the real data transfer.
  *	Do not send data in this hook, it may lead to a deadlock.
  *
- * @dcd_change: ``void ()(struct tty_struct *tty, unsigned int status)``
+ * @dcd_change: [DRV] ``void ()(struct tty_struct *tty, unsigned int status)``
  *
  *	Tells the discipline that the DCD pin has changed its status. Used
  *	exclusively by the %N_PPS (Pulse-Per-Second) line discipline.
  *
- * @receive_buf2: ``int ()(struct tty_struct *tty, const unsigned char *cp,
- *			const char *fp, int count)``
+ * @receive_buf2: [DRV] ``int ()(struct tty_struct *tty,
+ *			const unsigned char *cp, const char *fp, int count)``
  *
  *	This function is called by the low-level tty driver to send characters
  *	received by the hardware to the line discpline for processing. @cp is a
@@ -176,6 +192,9 @@ int ldsem_down_write_nested(struct ld_semaphore *sem, int subclass,
  * implementation and the tty routines. The above routines can be defined.
  * Unless noted otherwise, they are optional, and can be filled in with a %NULL
  * pointer.
+ *
+ * Hooks marked [TTY] are invoked from the TTY core, the [DRV] ones from the
+ * tty_driver side.
  */
 struct tty_ldisc_ops {
 	char	*name;
-- 
cgit v1.2.3


From 4072254f96f954ec0d34899f15d987803b6d76a2 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Fri, 26 Nov 2021 09:15:57 +0100
Subject: tty: reformat tty_struct::flags into kernel-doc

Move the partial tty_struct::flags documentation from tty_ldisc to the
tty.h header and combine it with the one-liners present there. Convert
all those to kernel-doc. This way, we can simply reference the
documentation in Documentation while the text is still along the
definitions.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20211126081611.11001-10-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/tty.h | 74 +++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 60 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tty.h b/include/linux/tty.h
index da49ad9be281..7b0a5d478ef6 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -263,26 +263,72 @@ struct tty_file_private {
 /* tty magic number */
 #define TTY_MAGIC		0x5401
 
-/*
- * These bits are used in the flags field of the tty structure.
+/**
+ * DOC: TTY Struct Flags
+ *
+ * These bits are used in the :c:member:`tty_struct.flags` field.
  *
  * So that interrupts won't be able to mess up the queues,
  * copy_to_cooked must be atomic with respect to itself, as must
  * tty->write.  Thus, you must use the inline functions set_bit() and
  * clear_bit() to make things atomic.
+ *
+ * TTY_THROTTLED
+ *	Driver input is throttled. The ldisc should call
+ *	:c:member:`tty_driver.unthrottle()` in order to resume reception when
+ *	it is ready to process more data (at threshold min).
+ *
+ * TTY_IO_ERROR
+ *	If set, causes all subsequent userspace read/write calls on the tty to
+ *	fail, returning -%EIO. (May be no ldisc too.)
+ *
+ * TTY_OTHER_CLOSED
+ *	Device is a pty and the other side has closed.
+ *
+ * TTY_EXCLUSIVE
+ *	Exclusive open mode (a single opener).
+ *
+ * TTY_DO_WRITE_WAKEUP
+ *	If set, causes the driver to call the
+ *	:c:member:`tty_ldisc_ops.write_wakeup()` method in order to resume
+ *	transmission when it can accept more data to transmit.
+ *
+ * TTY_LDISC_OPEN
+ *	Indicates that a line discipline is open. For debugging purposes only.
+ *
+ * TTY_PTY_LOCK
+ *	A flag private to pty code to implement %TIOCSPTLCK/%TIOCGPTLCK logic.
+ *
+ * TTY_NO_WRITE_SPLIT
+ *	Prevent driver from splitting up writes into smaller chunks (preserve
+ *	write boundaries to driver).
+ *
+ * TTY_HUPPED
+ *	The TTY was hung up. This is set post :c:member:`tty_driver.hangup()`.
+ *
+ * TTY_HUPPING
+ *	The TTY is in the process of hanging up to abort potential readers.
+ *
+ * TTY_LDISC_CHANGING
+ *	Line discipline for this TTY is being changed. I/O should not block
+ *	when this is set. Use tty_io_nonblock() to check.
+ *
+ * TTY_LDISC_HALTED
+ *	Line discipline for this TTY was stopped. No work should be queued to
+ *	this ldisc.
  */
-#define TTY_THROTTLED 		0	/* Call unthrottle() at threshold min */
-#define TTY_IO_ERROR 		1	/* Cause an I/O error (may be no ldisc too) */
-#define TTY_OTHER_CLOSED 	2	/* Other side (if any) has closed */
-#define TTY_EXCLUSIVE 		3	/* Exclusive open mode */
-#define TTY_DO_WRITE_WAKEUP 	5	/* Call write_wakeup after queuing new */
-#define TTY_LDISC_OPEN	 	11	/* Line discipline is open */
-#define TTY_PTY_LOCK 		16	/* pty private */
-#define TTY_NO_WRITE_SPLIT 	17	/* Preserve write boundaries to driver */
-#define TTY_HUPPED 		18	/* Post driver->hangup() */
-#define TTY_HUPPING		19	/* Hangup in progress */
-#define TTY_LDISC_CHANGING	20	/* Change pending - non-block IO */
-#define TTY_LDISC_HALTED	22	/* Line discipline is halted */
+#define TTY_THROTTLED		0
+#define TTY_IO_ERROR		1
+#define TTY_OTHER_CLOSED	2
+#define TTY_EXCLUSIVE		3
+#define TTY_DO_WRITE_WAKEUP	5
+#define TTY_LDISC_OPEN		11
+#define TTY_PTY_LOCK		16
+#define TTY_NO_WRITE_SPLIT	17
+#define TTY_HUPPED		18
+#define TTY_HUPPING		19
+#define TTY_LDISC_CHANGING	20
+#define TTY_LDISC_HALTED	22
 
 static inline bool tty_io_nonblock(struct tty_struct *tty, struct file *file)
 {
-- 
cgit v1.2.3


From 34d809f8b4ff68f63e8d7f71d93d150382c6bb8b Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Fri, 26 Nov 2021 09:15:58 +0100
Subject: tty: reformat TTY_DRIVER_ flags into kernel-doc

We want to reference TTY_DRIVER_* flags in Documentation/ later in this
series. But the current documentation in the TTY_DRIVER_*'s header does
not allow that. Reformat it to kernel-doc using "DOC" directive and
line-feeds, so that we can include it as it is.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20211126081611.11001-11-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/tty_driver.h | 90 ++++++++++++++++++++++++----------------------
 1 file changed, 47 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h
index 41274d551e28..4841d8069c07 100644
--- a/include/linux/tty_driver.h
+++ b/include/linux/tty_driver.h
@@ -492,49 +492,53 @@ static inline void tty_set_operations(struct tty_driver *driver,
 /* tty driver magic number */
 #define TTY_DRIVER_MAGIC		0x5402
 
-/*
- * tty driver flags
- * 
- * TTY_DRIVER_RESET_TERMIOS --- requests the tty layer to reset the
- * 	termios setting when the last process has closed the device.
- * 	Used for PTY's, in particular.
- * 
- * TTY_DRIVER_REAL_RAW --- if set, indicates that the driver will
- * 	guarantee never to set any special character handling
- * 	flags if ((IGNBRK || (!BRKINT && !PARMRK)) && (IGNPAR ||
- * 	!INPCK)).  That is, if there is no reason for the driver to
- * 	send notifications of parity and break characters up to the
- * 	line driver, it won't do so.  This allows the line driver to
- *	optimize for this case if this flag is set.  (Note that there
- * 	is also a promise, if the above case is true, not to signal
- * 	overruns, either.)
- *
- * TTY_DRIVER_DYNAMIC_DEV --- if set, the individual tty devices need
- *	to be registered with a call to tty_register_device() when the
- *	device is found in the system and unregistered with a call to
- *	tty_unregister_device() so the devices will be show up
- *	properly in sysfs.  If not set, driver->num entries will be
- *	created by the tty core in sysfs when tty_register_driver() is
- *	called.  This is to be used by drivers that have tty devices
- *	that can appear and disappear while the main tty driver is
- *	registered with the tty core.
- *
- * TTY_DRIVER_DEVPTS_MEM -- don't use the standard arrays, instead
- *	use dynamic memory keyed through the devpts filesystem.  This
- *	is only applicable to the pty driver.
- *
- * TTY_DRIVER_HARDWARE_BREAK -- hardware handles break signals. Pass
- *	the requested timeout to the caller instead of using a simple
- *	on/off interface.
- *
- * TTY_DRIVER_DYNAMIC_ALLOC -- do not allocate structures which are
- *	needed per line for this driver as it would waste memory.
- *	The driver will take care.
- *
- * TTY_DRIVER_UNNUMBERED_NODE -- do not create numbered /dev nodes. In
- *	other words create /dev/ttyprintk and not /dev/ttyprintk0.
- *	Applicable only when a driver for a single tty device is
- *	being allocated.
+/**
+ * DOC: TTY Driver Flags
+ *
+ * TTY_DRIVER_RESET_TERMIOS
+ *	Requests the tty layer to reset the termios setting when the last
+ *	process has closed the device. Used for PTYs, in particular.
+ *
+ * TTY_DRIVER_REAL_RAW
+ *	Indicates that the driver will guarantee not to set any special
+ *	character handling flags if this is set for the tty:
+ *
+ *	``(IGNBRK || (!BRKINT && !PARMRK)) && (IGNPAR || !INPCK)``
+ *
+ *	That is, if there is no reason for the driver to
+ *	send notifications of parity and break characters up to the line
+ *	driver, it won't do so.  This allows the line driver to optimize for
+ *	this case if this flag is set.  (Note that there is also a promise, if
+ *	the above case is true, not to signal overruns, either.)
+ *
+ * TTY_DRIVER_DYNAMIC_DEV
+ *	The individual tty devices need to be registered with a call to
+ *	tty_register_device() when the device is found in the system and
+ *	unregistered with a call to tty_unregister_device() so the devices will
+ *	be show up properly in sysfs.  If not set, all &tty_driver.num entries
+ *	will be created by the tty core in sysfs when tty_register_driver() is
+ *	called.  This is to be used by drivers that have tty devices that can
+ *	appear and disappear while the main tty driver is registered with the
+ *	tty core.
+ *
+ * TTY_DRIVER_DEVPTS_MEM
+ *	Don't use the standard arrays (&tty_driver.ttys and
+ *	&tty_driver.termios), instead use dynamic memory keyed through the
+ *	devpts filesystem. This is only applicable to the PTY driver.
+ *
+ * TTY_DRIVER_HARDWARE_BREAK
+ *	Hardware handles break signals. Pass the requested timeout to the
+ *	&tty_operations.break_ctl instead of using a simple on/off interface.
+ *
+ * TTY_DRIVER_DYNAMIC_ALLOC
+ *	Do not allocate structures which are needed per line for this driver
+ *	(&tty_driver.ports) as it would waste memory. The driver will take
+ *	care. This is only applicable to the PTY driver.
+ *
+ * TTY_DRIVER_UNNUMBERED_NODE
+ *	Do not create numbered ``/dev`` nodes. For example, create
+ *	``/dev/ttyprintk`` and not ``/dev/ttyprintk0``. Applicable only when a
+ *	driver for a single tty device is being allocated.
  */
 #define TTY_DRIVER_INSTALLED		0x0001
 #define TTY_DRIVER_RESET_TERMIOS	0x0002
-- 
cgit v1.2.3


From df0e68c1e9945e2ee86d266ce45597bbd8299b06 Mon Sep 17 00:00:00 2001
From: Ian Abbott <abbotti@mev.co.uk>
Date: Wed, 17 Nov 2021 12:05:59 +0000
Subject: comedi: Move the main COMEDI headers

Move the main COMEDI driver headers out of "drivers/comedi/" into new
directory "include/linux/comedi/".  These are "comedidev.h",
"comedilib.h", "comedi_pci.h", "comedi_pcmcia.h", and "comedi_usb.h".
Additionally, move the user-space API header "comedi.h" into
"include/uapi/linux/" and add "WITH Linux-syscall-note" to its
SPDX-License-Identifier.

Update the "COMEDI DRIVERS" section of the MAINTAINERS file to account
for these changes.

Signed-off-by: Ian Abbott <abbotti@mev.co.uk>
Link: https://lore.kernel.org/r/20211117120604.117740-2-abbotti@mev.co.uk
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/comedi/comedi_pci.h    |   56 ++
 include/linux/comedi/comedi_pcmcia.h |   48 ++
 include/linux/comedi/comedi_usb.h    |   41 ++
 include/linux/comedi/comedidev.h     | 1053 ++++++++++++++++++++++++++++++++++
 include/linux/comedi/comedilib.h     |   26 +
 5 files changed, 1224 insertions(+)
 create mode 100644 include/linux/comedi/comedi_pci.h
 create mode 100644 include/linux/comedi/comedi_pcmcia.h
 create mode 100644 include/linux/comedi/comedi_usb.h
 create mode 100644 include/linux/comedi/comedidev.h
 create mode 100644 include/linux/comedi/comedilib.h

(limited to 'include/linux')

diff --git a/include/linux/comedi/comedi_pci.h b/include/linux/comedi/comedi_pci.h
new file mode 100644
index 000000000000..2fb50663e3ed
--- /dev/null
+++ b/include/linux/comedi/comedi_pci.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * comedi_pci.h
+ * header file for Comedi PCI drivers
+ *
+ * COMEDI - Linux Control and Measurement Device Interface
+ * Copyright (C) 1997-2000 David A. Schleef <ds@schleef.org>
+ */
+
+#ifndef _COMEDI_PCI_H
+#define _COMEDI_PCI_H
+
+#include <linux/pci.h>
+#include <linux/comedi/comedidev.h>
+
+/*
+ * PCI Vendor IDs not in <linux/pci_ids.h>
+ */
+#define PCI_VENDOR_ID_KOLTER		0x1001
+#define PCI_VENDOR_ID_ICP		0x104c
+#define PCI_VENDOR_ID_DT		0x1116
+#define PCI_VENDOR_ID_IOTECH		0x1616
+#define PCI_VENDOR_ID_CONTEC		0x1221
+#define PCI_VENDOR_ID_RTD		0x1435
+#define PCI_VENDOR_ID_HUMUSOFT		0x186c
+
+struct pci_dev *comedi_to_pci_dev(struct comedi_device *dev);
+
+int comedi_pci_enable(struct comedi_device *dev);
+void comedi_pci_disable(struct comedi_device *dev);
+void comedi_pci_detach(struct comedi_device *dev);
+
+int comedi_pci_auto_config(struct pci_dev *pcidev, struct comedi_driver *driver,
+			   unsigned long context);
+void comedi_pci_auto_unconfig(struct pci_dev *pcidev);
+
+int comedi_pci_driver_register(struct comedi_driver *comedi_driver,
+			       struct pci_driver *pci_driver);
+void comedi_pci_driver_unregister(struct comedi_driver *comedi_driver,
+				  struct pci_driver *pci_driver);
+
+/**
+ * module_comedi_pci_driver() - Helper macro for registering a comedi PCI driver
+ * @__comedi_driver: comedi_driver struct
+ * @__pci_driver: pci_driver struct
+ *
+ * Helper macro for comedi PCI drivers which do not do anything special
+ * in module init/exit. This eliminates a lot of boilerplate. Each
+ * module may only use this macro once, and calling it replaces
+ * module_init() and module_exit()
+ */
+#define module_comedi_pci_driver(__comedi_driver, __pci_driver) \
+	module_driver(__comedi_driver, comedi_pci_driver_register, \
+			comedi_pci_driver_unregister, &(__pci_driver))
+
+#endif /* _COMEDI_PCI_H */
diff --git a/include/linux/comedi/comedi_pcmcia.h b/include/linux/comedi/comedi_pcmcia.h
new file mode 100644
index 000000000000..a33dfb65b869
--- /dev/null
+++ b/include/linux/comedi/comedi_pcmcia.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * comedi_pcmcia.h
+ * header file for Comedi PCMCIA drivers
+ *
+ * COMEDI - Linux Control and Measurement Device Interface
+ * Copyright (C) 1997-2000 David A. Schleef <ds@schleef.org>
+ */
+
+#ifndef _COMEDI_PCMCIA_H
+#define _COMEDI_PCMCIA_H
+
+#include <pcmcia/cistpl.h>
+#include <pcmcia/ds.h>
+#include <linux/comedi/comedidev.h>
+
+struct pcmcia_device *comedi_to_pcmcia_dev(struct comedi_device *dev);
+
+int comedi_pcmcia_enable(struct comedi_device *dev,
+			 int (*conf_check)(struct pcmcia_device *p_dev,
+					   void *priv_data));
+void comedi_pcmcia_disable(struct comedi_device *dev);
+
+int comedi_pcmcia_auto_config(struct pcmcia_device *link,
+			      struct comedi_driver *driver);
+void comedi_pcmcia_auto_unconfig(struct pcmcia_device *link);
+
+int comedi_pcmcia_driver_register(struct comedi_driver *comedi_driver,
+				  struct pcmcia_driver *pcmcia_driver);
+void comedi_pcmcia_driver_unregister(struct comedi_driver *comedi_driver,
+				     struct pcmcia_driver *pcmcia_driver);
+
+/**
+ * module_comedi_pcmcia_driver() - Helper macro for registering a comedi
+ * PCMCIA driver
+ * @__comedi_driver: comedi_driver struct
+ * @__pcmcia_driver: pcmcia_driver struct
+ *
+ * Helper macro for comedi PCMCIA drivers which do not do anything special
+ * in module init/exit. This eliminates a lot of boilerplate. Each
+ * module may only use this macro once, and calling it replaces
+ * module_init() and module_exit()
+ */
+#define module_comedi_pcmcia_driver(__comedi_driver, __pcmcia_driver) \
+	module_driver(__comedi_driver, comedi_pcmcia_driver_register, \
+			comedi_pcmcia_driver_unregister, &(__pcmcia_driver))
+
+#endif /* _COMEDI_PCMCIA_H */
diff --git a/include/linux/comedi/comedi_usb.h b/include/linux/comedi/comedi_usb.h
new file mode 100644
index 000000000000..5d17dd425bd2
--- /dev/null
+++ b/include/linux/comedi/comedi_usb.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/* comedi_usb.h
+ * header file for USB Comedi drivers
+ *
+ * COMEDI - Linux Control and Measurement Device Interface
+ * Copyright (C) 1997-2000 David A. Schleef <ds@schleef.org>
+ */
+
+#ifndef _COMEDI_USB_H
+#define _COMEDI_USB_H
+
+#include <linux/usb.h>
+#include <linux/comedi/comedidev.h>
+
+struct usb_interface *comedi_to_usb_interface(struct comedi_device *dev);
+struct usb_device *comedi_to_usb_dev(struct comedi_device *dev);
+
+int comedi_usb_auto_config(struct usb_interface *intf,
+			   struct comedi_driver *driver, unsigned long context);
+void comedi_usb_auto_unconfig(struct usb_interface *intf);
+
+int comedi_usb_driver_register(struct comedi_driver *comedi_driver,
+			       struct usb_driver *usb_driver);
+void comedi_usb_driver_unregister(struct comedi_driver *comedi_driver,
+				  struct usb_driver *usb_driver);
+
+/**
+ * module_comedi_usb_driver() - Helper macro for registering a comedi USB driver
+ * @__comedi_driver: comedi_driver struct
+ * @__usb_driver: usb_driver struct
+ *
+ * Helper macro for comedi USB drivers which do not do anything special
+ * in module init/exit. This eliminates a lot of boilerplate. Each
+ * module may only use this macro once, and calling it replaces
+ * module_init() and module_exit()
+ */
+#define module_comedi_usb_driver(__comedi_driver, __usb_driver) \
+	module_driver(__comedi_driver, comedi_usb_driver_register, \
+			comedi_usb_driver_unregister, &(__usb_driver))
+
+#endif /* _COMEDI_USB_H */
diff --git a/include/linux/comedi/comedidev.h b/include/linux/comedi/comedidev.h
new file mode 100644
index 000000000000..0a1150900ef3
--- /dev/null
+++ b/include/linux/comedi/comedidev.h
@@ -0,0 +1,1053 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * comedidev.h
+ * header file for kernel-only structures, variables, and constants
+ *
+ * COMEDI - Linux Control and Measurement Device Interface
+ * Copyright (C) 1997-2000 David A. Schleef <ds@schleef.org>
+ */
+
+#ifndef _COMEDIDEV_H
+#define _COMEDIDEV_H
+
+#include <linux/dma-mapping.h>
+#include <linux/mutex.h>
+#include <linux/spinlock_types.h>
+#include <linux/rwsem.h>
+#include <linux/kref.h>
+#include <linux/comedi.h>
+
+#define COMEDI_VERSION(a, b, c) (((a) << 16) + ((b) << 8) + (c))
+#define COMEDI_VERSION_CODE COMEDI_VERSION(COMEDI_MAJORVERSION, \
+	COMEDI_MINORVERSION, COMEDI_MICROVERSION)
+#define COMEDI_RELEASE VERSION
+
+#define COMEDI_NUM_BOARD_MINORS 0x30
+
+/**
+ * struct comedi_subdevice - Working data for a COMEDI subdevice
+ * @device: COMEDI device to which this subdevice belongs.  (Initialized by
+ *	comedi_alloc_subdevices().)
+ * @index: Index of this subdevice within device's array of subdevices.
+ *	(Initialized by comedi_alloc_subdevices().)
+ * @type: Type of subdevice from &enum comedi_subdevice_type.  (Initialized by
+ *	the low-level driver.)
+ * @n_chan: Number of channels the subdevice supports.  (Initialized by the
+ *	low-level driver.)
+ * @subdev_flags: Various "SDF" flags indicating aspects of the subdevice to
+ *	the COMEDI core and user application.  (Initialized by the low-level
+ *	driver.)
+ * @len_chanlist: Maximum length of a channel list if the subdevice supports
+ *	asynchronous acquisition commands.  (Optionally initialized by the
+ *	low-level driver, or changed from 0 to 1 during post-configuration.)
+ * @private: Private data pointer which is either set by the low-level driver
+ *	itself, or by a call to comedi_alloc_spriv() which allocates storage.
+ *	In the latter case, the storage is automatically freed after the
+ *	low-level driver's "detach" handler is called for the device.
+ *	(Initialized by the low-level driver.)
+ * @async: Pointer to &struct comedi_async id the subdevice supports
+ *	asynchronous acquisition commands.  (Allocated and initialized during
+ *	post-configuration if needed.)
+ * @lock: Pointer to a file object that performed a %COMEDI_LOCK ioctl on the
+ *	subdevice.  (Initially NULL.)
+ * @busy: Pointer to a file object that is performing an asynchronous
+ *	acquisition command on the subdevice.  (Initially NULL.)
+ * @runflags: Internal flags for use by COMEDI core, mostly indicating whether
+ *	an asynchronous acquisition command is running.
+ * @spin_lock: Generic spin-lock for use by the COMEDI core and the low-level
+ *	driver.  (Initialized by comedi_alloc_subdevices().)
+ * @io_bits: Bit-mask indicating the channel directions for a DIO subdevice
+ *	with no more than 32 channels.  A '1' at a bit position indicates the
+ *	corresponding channel is configured as an output.  (Initialized by the
+ *	low-level driver for a DIO subdevice.  Forced to all-outputs during
+ *	post-configuration for a digital output subdevice.)
+ * @maxdata: If non-zero, this is the maximum raw data value of each channel.
+ *	If zero, the maximum data value is channel-specific.  (Initialized by
+ *	the low-level driver.)
+ * @maxdata_list: If the maximum data value is channel-specific, this points
+ *	to an array of maximum data values indexed by channel index.
+ *	(Initialized by the low-level driver.)
+ * @range_table: If non-NULL, this points to a COMEDI range table for the
+ *	subdevice.  If NULL, the range table is channel-specific.  (Initialized
+ *	by the low-level driver, will be set to an "invalid" range table during
+ *	post-configuration if @range_table and @range_table_list are both
+ *	NULL.)
+ * @range_table_list: If the COMEDI range table is channel-specific, this
+ *	points to an array of pointers to COMEDI range tables indexed by
+ *	channel number.  (Initialized by the low-level driver.)
+ * @chanlist: Not used.
+ * @insn_read: Optional pointer to a handler for the %INSN_READ instruction.
+ *	(Initialized by the low-level driver, or set to a default handler
+ *	during post-configuration.)
+ * @insn_write: Optional pointer to a handler for the %INSN_WRITE instruction.
+ *	(Initialized by the low-level driver, or set to a default handler
+ *	during post-configuration.)
+ * @insn_bits: Optional pointer to a handler for the %INSN_BITS instruction
+ *	for a digital input, digital output or digital input/output subdevice.
+ *	(Initialized by the low-level driver, or set to a default handler
+ *	during post-configuration.)
+ * @insn_config: Optional pointer to a handler for the %INSN_CONFIG
+ *	instruction.  (Initialized by the low-level driver, or set to a default
+ *	handler during post-configuration.)
+ * @do_cmd: If the subdevice supports asynchronous acquisition commands, this
+ *	points to a handler to set it up in hardware.  (Initialized by the
+ *	low-level driver.)
+ * @do_cmdtest: If the subdevice supports asynchronous acquisition commands,
+ *	this points to a handler used to check and possibly tweak a prospective
+ *	acquisition command without setting it up in hardware.  (Initialized by
+ *	the low-level driver.)
+ * @poll: If the subdevice supports asynchronous acquisition commands, this
+ *	is an optional pointer to a handler for the %COMEDI_POLL ioctl which
+ *	instructs the low-level driver to synchronize buffers.  (Initialized by
+ *	the low-level driver if needed.)
+ * @cancel: If the subdevice supports asynchronous acquisition commands, this
+ *	points to a handler used to terminate a running command.  (Initialized
+ *	by the low-level driver.)
+ * @buf_change: If the subdevice supports asynchronous acquisition commands,
+ *	this is an optional pointer to a handler that is called when the data
+ *	buffer for handling asynchronous commands is allocated or reallocated.
+ *	(Initialized by the low-level driver if needed.)
+ * @munge: If the subdevice supports asynchronous acquisition commands and
+ *	uses DMA to transfer data from the hardware to the acquisition buffer,
+ *	this points to a function used to "munge" the data values from the
+ *	hardware into the format expected by COMEDI.  (Initialized by the
+ *	low-level driver if needed.)
+ * @async_dma_dir: If the subdevice supports asynchronous acquisition commands
+ *	and uses DMA to transfer data from the hardware to the acquisition
+ *	buffer, this sets the DMA direction for the buffer. (initialized to
+ *	%DMA_NONE by comedi_alloc_subdevices() and changed by the low-level
+ *	driver if necessary.)
+ * @state: Handy bit-mask indicating the output states for a DIO or digital
+ *	output subdevice with no more than 32 channels. (Initialized by the
+ *	low-level driver.)
+ * @class_dev: If the subdevice supports asynchronous acquisition commands,
+ *	this points to a sysfs comediX_subdY device where X is the minor device
+ *	number of the COMEDI device and Y is the subdevice number.  The minor
+ *	device number for the sysfs device is allocated dynamically in the
+ *	range 48 to 255.  This is used to allow the COMEDI device to be opened
+ *	with a different default read or write subdevice.  (Allocated during
+ *	post-configuration if needed.)
+ * @minor: If @class_dev is set, this is its dynamically allocated minor
+ *	device number.  (Set during post-configuration if necessary.)
+ * @readback: Optional pointer to memory allocated by
+ *	comedi_alloc_subdev_readback() used to hold the values written to
+ *	analog output channels so they can be read back.  The storage is
+ *	automatically freed after the low-level driver's "detach" handler is
+ *	called for the device.  (Initialized by the low-level driver.)
+ *
+ * This is the main control structure for a COMEDI subdevice.  If the subdevice
+ * supports asynchronous acquisition commands, additional information is stored
+ * in the &struct comedi_async pointed to by @async.
+ *
+ * Most of the subdevice is initialized by the low-level driver's "attach" or
+ * "auto_attach" handlers but parts of it are initialized by
+ * comedi_alloc_subdevices(), and other parts are initialized during
+ * post-configuration on return from that handler.
+ *
+ * A low-level driver that sets @insn_bits for a digital input, digital output,
+ * or DIO subdevice may leave @insn_read and @insn_write uninitialized, in
+ * which case they will be set to a default handler during post-configuration
+ * that uses @insn_bits to emulate the %INSN_READ and %INSN_WRITE instructions.
+ */
+struct comedi_subdevice {
+	struct comedi_device *device;
+	int index;
+	int type;
+	int n_chan;
+	int subdev_flags;
+	int len_chanlist;	/* maximum length of channel/gain list */
+
+	void *private;
+
+	struct comedi_async *async;
+
+	void *lock;
+	void *busy;
+	unsigned int runflags;
+	spinlock_t spin_lock;	/* generic spin-lock for COMEDI and drivers */
+
+	unsigned int io_bits;
+
+	unsigned int maxdata;	/* if maxdata==0, use list */
+	const unsigned int *maxdata_list;	/* list is channel specific */
+
+	const struct comedi_lrange *range_table;
+	const struct comedi_lrange *const *range_table_list;
+
+	unsigned int *chanlist;	/* driver-owned chanlist (not used) */
+
+	int (*insn_read)(struct comedi_device *dev, struct comedi_subdevice *s,
+			 struct comedi_insn *insn, unsigned int *data);
+	int (*insn_write)(struct comedi_device *dev, struct comedi_subdevice *s,
+			  struct comedi_insn *insn, unsigned int *data);
+	int (*insn_bits)(struct comedi_device *dev, struct comedi_subdevice *s,
+			 struct comedi_insn *insn, unsigned int *data);
+	int (*insn_config)(struct comedi_device *dev,
+			   struct comedi_subdevice *s,
+			   struct comedi_insn *insn,
+			   unsigned int *data);
+
+	int (*do_cmd)(struct comedi_device *dev, struct comedi_subdevice *s);
+	int (*do_cmdtest)(struct comedi_device *dev,
+			  struct comedi_subdevice *s,
+			  struct comedi_cmd *cmd);
+	int (*poll)(struct comedi_device *dev, struct comedi_subdevice *s);
+	int (*cancel)(struct comedi_device *dev, struct comedi_subdevice *s);
+
+	/* called when the buffer changes */
+	int (*buf_change)(struct comedi_device *dev,
+			  struct comedi_subdevice *s);
+
+	void (*munge)(struct comedi_device *dev, struct comedi_subdevice *s,
+		      void *data, unsigned int num_bytes,
+		      unsigned int start_chan_index);
+	enum dma_data_direction async_dma_dir;
+
+	unsigned int state;
+
+	struct device *class_dev;
+	int minor;
+
+	unsigned int *readback;
+};
+
+/**
+ * struct comedi_buf_page - Describe a page of a COMEDI buffer
+ * @virt_addr: Kernel address of page.
+ * @dma_addr: DMA address of page if in DMA coherent memory.
+ */
+struct comedi_buf_page {
+	void *virt_addr;
+	dma_addr_t dma_addr;
+};
+
+/**
+ * struct comedi_buf_map - Describe pages in a COMEDI buffer
+ * @dma_hw_dev: Low-level hardware &struct device pointer copied from the
+ *	COMEDI device's hw_dev member.
+ * @page_list: Pointer to array of &struct comedi_buf_page, one for each
+ *	page in the buffer.
+ * @n_pages: Number of pages in the buffer.
+ * @dma_dir: DMA direction used to allocate pages of DMA coherent memory,
+ *	or %DMA_NONE if pages allocated from regular memory.
+ * @refcount: &struct kref reference counter used to free the buffer.
+ *
+ * A COMEDI data buffer is allocated as individual pages, either in
+ * conventional memory or DMA coherent memory, depending on the attached,
+ * low-level hardware device.  (The buffer pages also get mapped into the
+ * kernel's contiguous virtual address space pointed to by the 'prealloc_buf'
+ * member of &struct comedi_async.)
+ *
+ * The buffer is normally freed when the COMEDI device is detached from the
+ * low-level driver (which may happen due to device removal), but if it happens
+ * to be mmapped at the time, the pages cannot be freed until the buffer has
+ * been munmapped.  That is what the reference counter is for.  (The virtual
+ * address space pointed by 'prealloc_buf' is freed when the COMEDI device is
+ * detached.)
+ */
+struct comedi_buf_map {
+	struct device *dma_hw_dev;
+	struct comedi_buf_page *page_list;
+	unsigned int n_pages;
+	enum dma_data_direction dma_dir;
+	struct kref refcount;
+};
+
+/**
+ * struct comedi_async - Control data for asynchronous COMEDI commands
+ * @prealloc_buf: Kernel virtual address of allocated acquisition buffer.
+ * @prealloc_bufsz: Buffer size (in bytes).
+ * @buf_map: Map of buffer pages.
+ * @max_bufsize: Maximum allowed buffer size (in bytes).
+ * @buf_write_count: "Write completed" count (in bytes, modulo 2**32).
+ * @buf_write_alloc_count: "Allocated for writing" count (in bytes,
+ *	modulo 2**32).
+ * @buf_read_count: "Read completed" count (in bytes, modulo 2**32).
+ * @buf_read_alloc_count: "Allocated for reading" count (in bytes,
+ *	modulo 2**32).
+ * @buf_write_ptr: Buffer position for writer.
+ * @buf_read_ptr: Buffer position for reader.
+ * @cur_chan: Current position in chanlist for scan (for those drivers that
+ *	use it).
+ * @scans_done: The number of scans completed.
+ * @scan_progress: Amount received or sent for current scan (in bytes).
+ * @munge_chan: Current position in chanlist for "munging".
+ * @munge_count: "Munge" count (in bytes, modulo 2**32).
+ * @munge_ptr: Buffer position for "munging".
+ * @events: Bit-vector of events that have occurred.
+ * @cmd: Details of comedi command in progress.
+ * @wait_head: Task wait queue for file reader or writer.
+ * @cb_mask: Bit-vector of events that should wake waiting tasks.
+ * @inttrig: Software trigger function for command, or NULL.
+ *
+ * Note about the ..._count and ..._ptr members:
+ *
+ * Think of the _Count values being integers of unlimited size, indexing
+ * into a buffer of infinite length (though only an advancing portion
+ * of the buffer of fixed length prealloc_bufsz is accessible at any
+ * time).  Then:
+ *
+ *   Buf_Read_Count <= Buf_Read_Alloc_Count <= Munge_Count <=
+ *   Buf_Write_Count <= Buf_Write_Alloc_Count <=
+ *   (Buf_Read_Count + prealloc_bufsz)
+ *
+ * (Those aren't the actual members, apart from prealloc_bufsz.) When the
+ * buffer is reset, those _Count values start at 0 and only increase in value,
+ * maintaining the above inequalities until the next time the buffer is
+ * reset.  The buffer is divided into the following regions by the inequalities:
+ *
+ *   [0, Buf_Read_Count):
+ *     old region no longer accessible
+ *
+ *   [Buf_Read_Count, Buf_Read_Alloc_Count):
+ *     filled and munged region allocated for reading but not yet read
+ *
+ *   [Buf_Read_Alloc_Count, Munge_Count):
+ *     filled and munged region not yet allocated for reading
+ *
+ *   [Munge_Count, Buf_Write_Count):
+ *     filled region not yet munged
+ *
+ *   [Buf_Write_Count, Buf_Write_Alloc_Count):
+ *     unfilled region allocated for writing but not yet written
+ *
+ *   [Buf_Write_Alloc_Count, Buf_Read_Count + prealloc_bufsz):
+ *     unfilled region not yet allocated for writing
+ *
+ *   [Buf_Read_Count + prealloc_bufsz, infinity):
+ *     unfilled region not yet accessible
+ *
+ * Data needs to be written into the buffer before it can be read out,
+ * and may need to be converted (or "munged") between the two
+ * operations.  Extra unfilled buffer space may need to allocated for
+ * writing (advancing Buf_Write_Alloc_Count) before new data is written.
+ * After writing new data, the newly filled space needs to be released
+ * (advancing Buf_Write_Count).  This also results in the new data being
+ * "munged" (advancing Munge_Count).  Before data is read out of the
+ * buffer, extra space may need to be allocated for reading (advancing
+ * Buf_Read_Alloc_Count).  After the data has been read out, the space
+ * needs to be released (advancing Buf_Read_Count).
+ *
+ * The actual members, buf_read_count, buf_read_alloc_count,
+ * munge_count, buf_write_count, and buf_write_alloc_count take the
+ * value of the corresponding capitalized _Count values modulo 2^32
+ * (UINT_MAX+1).  Subtracting a "higher" _count value from a "lower"
+ * _count value gives the same answer as subtracting a "higher" _Count
+ * value from a lower _Count value because prealloc_bufsz < UINT_MAX+1.
+ * The modulo operation is done implicitly.
+ *
+ * The buf_read_ptr, munge_ptr, and buf_write_ptr members take the value
+ * of the corresponding capitalized _Count values modulo prealloc_bufsz.
+ * These correspond to byte indices in the physical buffer.  The modulo
+ * operation is done by subtracting prealloc_bufsz when the value
+ * exceeds prealloc_bufsz (assuming prealloc_bufsz plus the increment is
+ * less than or equal to UINT_MAX).
+ */
+struct comedi_async {
+	void *prealloc_buf;
+	unsigned int prealloc_bufsz;
+	struct comedi_buf_map *buf_map;
+	unsigned int max_bufsize;
+	unsigned int buf_write_count;
+	unsigned int buf_write_alloc_count;
+	unsigned int buf_read_count;
+	unsigned int buf_read_alloc_count;
+	unsigned int buf_write_ptr;
+	unsigned int buf_read_ptr;
+	unsigned int cur_chan;
+	unsigned int scans_done;
+	unsigned int scan_progress;
+	unsigned int munge_chan;
+	unsigned int munge_count;
+	unsigned int munge_ptr;
+	unsigned int events;
+	struct comedi_cmd cmd;
+	wait_queue_head_t wait_head;
+	unsigned int cb_mask;
+	int (*inttrig)(struct comedi_device *dev, struct comedi_subdevice *s,
+		       unsigned int x);
+};
+
+/**
+ * enum comedi_cb - &struct comedi_async callback "events"
+ * @COMEDI_CB_EOS:		end-of-scan
+ * @COMEDI_CB_EOA:		end-of-acquisition/output
+ * @COMEDI_CB_BLOCK:		data has arrived, wakes up read() / write()
+ * @COMEDI_CB_EOBUF:		DEPRECATED: end of buffer
+ * @COMEDI_CB_ERROR:		card error during acquisition
+ * @COMEDI_CB_OVERFLOW:		buffer overflow/underflow
+ * @COMEDI_CB_ERROR_MASK:	events that indicate an error has occurred
+ * @COMEDI_CB_CANCEL_MASK:	events that will cancel an async command
+ */
+enum comedi_cb {
+	COMEDI_CB_EOS		= BIT(0),
+	COMEDI_CB_EOA		= BIT(1),
+	COMEDI_CB_BLOCK		= BIT(2),
+	COMEDI_CB_EOBUF		= BIT(3),
+	COMEDI_CB_ERROR		= BIT(4),
+	COMEDI_CB_OVERFLOW	= BIT(5),
+	/* masks */
+	COMEDI_CB_ERROR_MASK	= (COMEDI_CB_ERROR | COMEDI_CB_OVERFLOW),
+	COMEDI_CB_CANCEL_MASK	= (COMEDI_CB_EOA | COMEDI_CB_ERROR_MASK)
+};
+
+/**
+ * struct comedi_driver - COMEDI driver registration
+ * @driver_name: Name of driver.
+ * @module: Owning module.
+ * @attach: The optional "attach" handler for manually configured COMEDI
+ *	devices.
+ * @detach: The "detach" handler for deconfiguring COMEDI devices.
+ * @auto_attach: The optional "auto_attach" handler for automatically
+ *	configured COMEDI devices.
+ * @num_names: Optional number of "board names" supported.
+ * @board_name: Optional pointer to a pointer to a board name.  The pointer
+ *	to a board name is embedded in an element of a driver-defined array
+ *	of static, read-only board type information.
+ * @offset: Optional size of each element of the driver-defined array of
+ *	static, read-only board type information, i.e. the offset between each
+ *	pointer to a board name.
+ *
+ * This is used with comedi_driver_register() and comedi_driver_unregister() to
+ * register and unregister a low-level COMEDI driver with the COMEDI core.
+ *
+ * If @num_names is non-zero, @board_name should be non-NULL, and @offset
+ * should be at least sizeof(*board_name).  These are used by the handler for
+ * the %COMEDI_DEVCONFIG ioctl to match a hardware device and its driver by
+ * board name.  If @num_names is zero, the %COMEDI_DEVCONFIG ioctl matches a
+ * hardware device and its driver by driver name.  This is only useful if the
+ * @attach handler is set.  If @num_names is non-zero, the driver's @attach
+ * handler will be called with the COMEDI device structure's board_ptr member
+ * pointing to the matched pointer to a board name within the driver's private
+ * array of static, read-only board type information.
+ *
+ * The @detach handler has two roles.  If a COMEDI device was successfully
+ * configured by the @attach or @auto_attach handler, it is called when the
+ * device is being deconfigured (by the %COMEDI_DEVCONFIG ioctl, or due to
+ * unloading of the driver, or due to device removal).  It is also called when
+ * the @attach or @auto_attach handler returns an error.  Therefore, the
+ * @attach or @auto_attach handlers can defer clean-up on error until the
+ * @detach handler is called.  If the @attach or @auto_attach handlers free
+ * any resources themselves, they must prevent the @detach handler from
+ * freeing the same resources.  The @detach handler must not assume that all
+ * resources requested by the @attach or @auto_attach handler were
+ * successfully allocated.
+ */
+struct comedi_driver {
+	/* private: */
+	struct comedi_driver *next;	/* Next in list of COMEDI drivers. */
+	/* public: */
+	const char *driver_name;
+	struct module *module;
+	int (*attach)(struct comedi_device *dev, struct comedi_devconfig *it);
+	void (*detach)(struct comedi_device *dev);
+	int (*auto_attach)(struct comedi_device *dev, unsigned long context);
+	unsigned int num_names;
+	const char *const *board_name;
+	int offset;
+};
+
+/**
+ * struct comedi_device - Working data for a COMEDI device
+ * @use_count: Number of open file objects.
+ * @driver: Low-level COMEDI driver attached to this COMEDI device.
+ * @pacer: Optional pointer to a dynamically allocated acquisition pacer
+ *	control.  It is freed automatically after the COMEDI device is
+ *	detached from the low-level driver.
+ * @private: Optional pointer to private data allocated by the low-level
+ *	driver.  It is freed automatically after the COMEDI device is
+ *	detached from the low-level driver.
+ * @class_dev: Sysfs comediX device.
+ * @minor: Minor device number of COMEDI char device (0-47).
+ * @detach_count: Counter incremented every time the COMEDI device is detached.
+ *	Used for checking a previous attachment is still valid.
+ * @hw_dev: Optional pointer to the low-level hardware &struct device.  It is
+ *	required for automatically configured COMEDI devices and optional for
+ *	COMEDI devices configured by the %COMEDI_DEVCONFIG ioctl, although
+ *	the bus-specific COMEDI functions only work if it is set correctly.
+ *	It is also passed to dma_alloc_coherent() for COMEDI subdevices that
+ *	have their 'async_dma_dir' member set to something other than
+ *	%DMA_NONE.
+ * @board_name: Pointer to a COMEDI board name or a COMEDI driver name.  When
+ *	the low-level driver's "attach" handler is called by the handler for
+ *	the %COMEDI_DEVCONFIG ioctl, it either points to a matched board name
+ *	string if the 'num_names' member of the &struct comedi_driver is
+ *	non-zero, otherwise it points to the low-level driver name string.
+ *	When the low-lever driver's "auto_attach" handler is called for an
+ *	automatically configured COMEDI device, it points to the low-level
+ *	driver name string.  The low-level driver is free to change it in its
+ *	"attach" or "auto_attach" handler if it wishes.
+ * @board_ptr: Optional pointer to private, read-only board type information in
+ *	the low-level driver.  If the 'num_names' member of the &struct
+ *	comedi_driver is non-zero, the handler for the %COMEDI_DEVCONFIG ioctl
+ *	will point it to a pointer to a matched board name string within the
+ *	driver's private array of static, read-only board type information when
+ *	calling the driver's "attach" handler.  The low-level driver is free to
+ *	change it.
+ * @attached: Flag indicating that the COMEDI device is attached to a low-level
+ *	driver.
+ * @ioenabled: Flag used to indicate that a PCI device has been enabled and
+ *	its regions requested.
+ * @spinlock: Generic spin-lock for use by the low-level driver.
+ * @mutex: Generic mutex for use by the COMEDI core module.
+ * @attach_lock: &struct rw_semaphore used to guard against the COMEDI device
+ *	being detached while an operation is in progress.  The down_write()
+ *	operation is only allowed while @mutex is held and is used when
+ *	changing @attached and @detach_count and calling the low-level driver's
+ *	"detach" handler.  The down_read() operation is generally used without
+ *	holding @mutex.
+ * @refcount: &struct kref reference counter for freeing COMEDI device.
+ * @n_subdevices: Number of COMEDI subdevices allocated by the low-level
+ *	driver for this device.
+ * @subdevices: Dynamically allocated array of COMEDI subdevices.
+ * @mmio: Optional pointer to a remapped MMIO region set by the low-level
+ *	driver.
+ * @iobase: Optional base of an I/O port region requested by the low-level
+ *	driver.
+ * @iolen: Length of I/O port region requested at @iobase.
+ * @irq: Optional IRQ number requested by the low-level driver.
+ * @read_subdev: Optional pointer to a default COMEDI subdevice operated on by
+ *	the read() file operation.  Set by the low-level driver.
+ * @write_subdev: Optional pointer to a default COMEDI subdevice operated on by
+ *	the write() file operation.  Set by the low-level driver.
+ * @async_queue: Storage for fasync_helper().
+ * @open: Optional pointer to a function set by the low-level driver to be
+ *	called when @use_count changes from 0 to 1.
+ * @close: Optional pointer to a function set by the low-level driver to be
+ *	called when @use_count changed from 1 to 0.
+ * @insn_device_config: Optional pointer to a handler for all sub-instructions
+ *	except %INSN_DEVICE_CONFIG_GET_ROUTES of the %INSN_DEVICE_CONFIG
+ *	instruction.  If this is not initialized by the low-level driver, a
+ *	default handler will be set during post-configuration.
+ * @get_valid_routes: Optional pointer to a handler for the
+ *	%INSN_DEVICE_CONFIG_GET_ROUTES sub-instruction of the
+ *	%INSN_DEVICE_CONFIG instruction set.  If this is not initialized by the
+ *	low-level driver, a default handler that copies zero routes back to the
+ *	user will be used.
+ *
+ * This is the main control data structure for a COMEDI device (as far as the
+ * COMEDI core is concerned).  There are two groups of COMEDI devices -
+ * "legacy" devices that are configured by the handler for the
+ * %COMEDI_DEVCONFIG ioctl, and automatically configured devices resulting
+ * from a call to comedi_auto_config() as a result of a bus driver probe in
+ * a low-level COMEDI driver.  The "legacy" COMEDI devices are allocated
+ * during module initialization if the "comedi_num_legacy_minors" module
+ * parameter is non-zero and use minor device numbers from 0 to
+ * comedi_num_legacy_minors minus one.  The automatically configured COMEDI
+ * devices are allocated on demand and use minor device numbers from
+ * comedi_num_legacy_minors to 47.
+ */
+struct comedi_device {
+	int use_count;
+	struct comedi_driver *driver;
+	struct comedi_8254 *pacer;
+	void *private;
+
+	struct device *class_dev;
+	int minor;
+	unsigned int detach_count;
+	struct device *hw_dev;
+
+	const char *board_name;
+	const void *board_ptr;
+	unsigned int attached:1;
+	unsigned int ioenabled:1;
+	spinlock_t spinlock;	/* generic spin-lock for low-level driver */
+	struct mutex mutex;	/* generic mutex for COMEDI core */
+	struct rw_semaphore attach_lock;
+	struct kref refcount;
+
+	int n_subdevices;
+	struct comedi_subdevice *subdevices;
+
+	/* dumb */
+	void __iomem *mmio;
+	unsigned long iobase;
+	unsigned long iolen;
+	unsigned int irq;
+
+	struct comedi_subdevice *read_subdev;
+	struct comedi_subdevice *write_subdev;
+
+	struct fasync_struct *async_queue;
+
+	int (*open)(struct comedi_device *dev);
+	void (*close)(struct comedi_device *dev);
+	int (*insn_device_config)(struct comedi_device *dev,
+				  struct comedi_insn *insn, unsigned int *data);
+	unsigned int (*get_valid_routes)(struct comedi_device *dev,
+					 unsigned int n_pairs,
+					 unsigned int *pair_data);
+};
+
+/*
+ * function prototypes
+ */
+
+void comedi_event(struct comedi_device *dev, struct comedi_subdevice *s);
+
+struct comedi_device *comedi_dev_get_from_minor(unsigned int minor);
+int comedi_dev_put(struct comedi_device *dev);
+
+bool comedi_is_subdevice_running(struct comedi_subdevice *s);
+
+void *comedi_alloc_spriv(struct comedi_subdevice *s, size_t size);
+void comedi_set_spriv_auto_free(struct comedi_subdevice *s);
+
+int comedi_check_chanlist(struct comedi_subdevice *s,
+			  int n,
+			  unsigned int *chanlist);
+
+/* range stuff */
+
+#define RANGE(a, b)		{(a) * 1e6, (b) * 1e6, 0}
+#define RANGE_ext(a, b)		{(a) * 1e6, (b) * 1e6, RF_EXTERNAL}
+#define RANGE_mA(a, b)		{(a) * 1e6, (b) * 1e6, UNIT_mA}
+#define RANGE_unitless(a, b)	{(a) * 1e6, (b) * 1e6, 0}
+#define BIP_RANGE(a)		{-(a) * 1e6, (a) * 1e6, 0}
+#define UNI_RANGE(a)		{0, (a) * 1e6, 0}
+
+extern const struct comedi_lrange range_bipolar10;
+extern const struct comedi_lrange range_bipolar5;
+extern const struct comedi_lrange range_bipolar2_5;
+extern const struct comedi_lrange range_unipolar10;
+extern const struct comedi_lrange range_unipolar5;
+extern const struct comedi_lrange range_unipolar2_5;
+extern const struct comedi_lrange range_0_20mA;
+extern const struct comedi_lrange range_4_20mA;
+extern const struct comedi_lrange range_0_32mA;
+extern const struct comedi_lrange range_unknown;
+
+#define range_digital		range_unipolar5
+
+/**
+ * struct comedi_lrange - Describes a COMEDI range table
+ * @length: Number of entries in the range table.
+ * @range: Array of &struct comedi_krange, one for each range.
+ *
+ * Each element of @range[] describes the minimum and maximum physical range
+ * and the type of units.  Typically, the type of unit is %UNIT_volt
+ * (i.e. volts) and the minimum and maximum are in millionths of a volt.
+ * There may also be a flag that indicates the minimum and maximum are merely
+ * scale factors for an unknown, external reference.
+ */
+struct comedi_lrange {
+	int length;
+	struct comedi_krange range[];
+};
+
+/**
+ * comedi_range_is_bipolar() - Test if subdevice range is bipolar
+ * @s: COMEDI subdevice.
+ * @range: Index of range within a range table.
+ *
+ * Tests whether a range is bipolar by checking whether its minimum value
+ * is negative.
+ *
+ * Assumes @range is valid.  Does not work for subdevices using a
+ * channel-specific range table list.
+ *
+ * Return:
+ *	%true if the range is bipolar.
+ *	%false if the range is unipolar.
+ */
+static inline bool comedi_range_is_bipolar(struct comedi_subdevice *s,
+					   unsigned int range)
+{
+	return s->range_table->range[range].min < 0;
+}
+
+/**
+ * comedi_range_is_unipolar() - Test if subdevice range is unipolar
+ * @s: COMEDI subdevice.
+ * @range: Index of range within a range table.
+ *
+ * Tests whether a range is unipolar by checking whether its minimum value
+ * is at least 0.
+ *
+ * Assumes @range is valid.  Does not work for subdevices using a
+ * channel-specific range table list.
+ *
+ * Return:
+ *	%true if the range is unipolar.
+ *	%false if the range is bipolar.
+ */
+static inline bool comedi_range_is_unipolar(struct comedi_subdevice *s,
+					    unsigned int range)
+{
+	return s->range_table->range[range].min >= 0;
+}
+
+/**
+ * comedi_range_is_external() - Test if subdevice range is external
+ * @s: COMEDI subdevice.
+ * @range: Index of range within a range table.
+ *
+ * Tests whether a range is externally reference by checking whether its
+ * %RF_EXTERNAL flag is set.
+ *
+ * Assumes @range is valid.  Does not work for subdevices using a
+ * channel-specific range table list.
+ *
+ * Return:
+ *	%true if the range is external.
+ *	%false if the range is internal.
+ */
+static inline bool comedi_range_is_external(struct comedi_subdevice *s,
+					    unsigned int range)
+{
+	return !!(s->range_table->range[range].flags & RF_EXTERNAL);
+}
+
+/**
+ * comedi_chan_range_is_bipolar() - Test if channel-specific range is bipolar
+ * @s: COMEDI subdevice.
+ * @chan: The channel number.
+ * @range: Index of range within a range table.
+ *
+ * Tests whether a range is bipolar by checking whether its minimum value
+ * is negative.
+ *
+ * Assumes @chan and @range are valid.  Only works for subdevices with a
+ * channel-specific range table list.
+ *
+ * Return:
+ *	%true if the range is bipolar.
+ *	%false if the range is unipolar.
+ */
+static inline bool comedi_chan_range_is_bipolar(struct comedi_subdevice *s,
+						unsigned int chan,
+						unsigned int range)
+{
+	return s->range_table_list[chan]->range[range].min < 0;
+}
+
+/**
+ * comedi_chan_range_is_unipolar() - Test if channel-specific range is unipolar
+ * @s: COMEDI subdevice.
+ * @chan: The channel number.
+ * @range: Index of range within a range table.
+ *
+ * Tests whether a range is unipolar by checking whether its minimum value
+ * is at least 0.
+ *
+ * Assumes @chan and @range are valid.  Only works for subdevices with a
+ * channel-specific range table list.
+ *
+ * Return:
+ *	%true if the range is unipolar.
+ *	%false if the range is bipolar.
+ */
+static inline bool comedi_chan_range_is_unipolar(struct comedi_subdevice *s,
+						 unsigned int chan,
+						 unsigned int range)
+{
+	return s->range_table_list[chan]->range[range].min >= 0;
+}
+
+/**
+ * comedi_chan_range_is_external() - Test if channel-specific range is external
+ * @s: COMEDI subdevice.
+ * @chan: The channel number.
+ * @range: Index of range within a range table.
+ *
+ * Tests whether a range is externally reference by checking whether its
+ * %RF_EXTERNAL flag is set.
+ *
+ * Assumes @chan and @range are valid.  Only works for subdevices with a
+ * channel-specific range table list.
+ *
+ * Return:
+ *	%true if the range is bipolar.
+ *	%false if the range is unipolar.
+ */
+static inline bool comedi_chan_range_is_external(struct comedi_subdevice *s,
+						 unsigned int chan,
+						 unsigned int range)
+{
+	return !!(s->range_table_list[chan]->range[range].flags & RF_EXTERNAL);
+}
+
+/**
+ * comedi_offset_munge() - Convert between offset binary and 2's complement
+ * @s: COMEDI subdevice.
+ * @val: Value to be converted.
+ *
+ * Toggles the highest bit of a sample value to toggle between offset binary
+ * and 2's complement.  Assumes that @s->maxdata is a power of 2 minus 1.
+ *
+ * Return: The converted value.
+ */
+static inline unsigned int comedi_offset_munge(struct comedi_subdevice *s,
+					       unsigned int val)
+{
+	return val ^ s->maxdata ^ (s->maxdata >> 1);
+}
+
+/**
+ * comedi_bytes_per_sample() - Determine subdevice sample size
+ * @s: COMEDI subdevice.
+ *
+ * The sample size will be 4 (sizeof int) or 2 (sizeof short) depending on
+ * whether the %SDF_LSAMPL subdevice flag is set or not.
+ *
+ * Return: The subdevice sample size.
+ */
+static inline unsigned int comedi_bytes_per_sample(struct comedi_subdevice *s)
+{
+	return s->subdev_flags & SDF_LSAMPL ? sizeof(int) : sizeof(short);
+}
+
+/**
+ * comedi_sample_shift() - Determine log2 of subdevice sample size
+ * @s: COMEDI subdevice.
+ *
+ * The sample size will be 4 (sizeof int) or 2 (sizeof short) depending on
+ * whether the %SDF_LSAMPL subdevice flag is set or not.  The log2 of the
+ * sample size will be 2 or 1 and can be used as the right operand of a
+ * bit-shift operator to multiply or divide something by the sample size.
+ *
+ * Return: log2 of the subdevice sample size.
+ */
+static inline unsigned int comedi_sample_shift(struct comedi_subdevice *s)
+{
+	return s->subdev_flags & SDF_LSAMPL ? 2 : 1;
+}
+
+/**
+ * comedi_bytes_to_samples() - Convert a number of bytes to a number of samples
+ * @s: COMEDI subdevice.
+ * @nbytes: Number of bytes
+ *
+ * Return: The number of bytes divided by the subdevice sample size.
+ */
+static inline unsigned int comedi_bytes_to_samples(struct comedi_subdevice *s,
+						   unsigned int nbytes)
+{
+	return nbytes >> comedi_sample_shift(s);
+}
+
+/**
+ * comedi_samples_to_bytes() - Convert a number of samples to a number of bytes
+ * @s: COMEDI subdevice.
+ * @nsamples: Number of samples.
+ *
+ * Return: The number of samples multiplied by the subdevice sample size.
+ * (Does not check for arithmetic overflow.)
+ */
+static inline unsigned int comedi_samples_to_bytes(struct comedi_subdevice *s,
+						   unsigned int nsamples)
+{
+	return nsamples << comedi_sample_shift(s);
+}
+
+/**
+ * comedi_check_trigger_src() - Trivially validate a comedi_cmd trigger source
+ * @src: Pointer to the trigger source to validate.
+ * @flags: Bitmask of valid %TRIG_* for the trigger.
+ *
+ * This is used in "step 1" of the do_cmdtest functions of comedi drivers
+ * to validate the comedi_cmd triggers. The mask of the @src against the
+ * @flags allows the userspace comedilib to pass all the comedi_cmd
+ * triggers as %TRIG_ANY and get back a bitmask of the valid trigger sources.
+ *
+ * Return:
+ *	0 if trigger sources in *@src are all supported.
+ *	-EINVAL if any trigger source in *@src is unsupported.
+ */
+static inline int comedi_check_trigger_src(unsigned int *src,
+					   unsigned int flags)
+{
+	unsigned int orig_src = *src;
+
+	*src = orig_src & flags;
+	if (*src == TRIG_INVALID || *src != orig_src)
+		return -EINVAL;
+	return 0;
+}
+
+/**
+ * comedi_check_trigger_is_unique() - Make sure a trigger source is unique
+ * @src: The trigger source to check.
+ *
+ * Return:
+ *	0 if no more than one trigger source is set.
+ *	-EINVAL if more than one trigger source is set.
+ */
+static inline int comedi_check_trigger_is_unique(unsigned int src)
+{
+	/* this test is true if more than one _src bit is set */
+	if ((src & (src - 1)) != 0)
+		return -EINVAL;
+	return 0;
+}
+
+/**
+ * comedi_check_trigger_arg_is() - Trivially validate a trigger argument
+ * @arg: Pointer to the trigger arg to validate.
+ * @val: The value the argument should be.
+ *
+ * Forces *@arg to be @val.
+ *
+ * Return:
+ *	0 if *@arg was already @val.
+ *	-EINVAL if *@arg differed from @val.
+ */
+static inline int comedi_check_trigger_arg_is(unsigned int *arg,
+					      unsigned int val)
+{
+	if (*arg != val) {
+		*arg = val;
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/**
+ * comedi_check_trigger_arg_min() - Trivially validate a trigger argument min
+ * @arg: Pointer to the trigger arg to validate.
+ * @val: The minimum value the argument should be.
+ *
+ * Forces *@arg to be at least @val, setting it to @val if necessary.
+ *
+ * Return:
+ *	0 if *@arg was already at least @val.
+ *	-EINVAL if *@arg was less than @val.
+ */
+static inline int comedi_check_trigger_arg_min(unsigned int *arg,
+					       unsigned int val)
+{
+	if (*arg < val) {
+		*arg = val;
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/**
+ * comedi_check_trigger_arg_max() - Trivially validate a trigger argument max
+ * @arg: Pointer to the trigger arg to validate.
+ * @val: The maximum value the argument should be.
+ *
+ * Forces *@arg to be no more than @val, setting it to @val if necessary.
+ *
+ * Return:
+ *	0 if*@arg was already no more than @val.
+ *	-EINVAL if *@arg was greater than @val.
+ */
+static inline int comedi_check_trigger_arg_max(unsigned int *arg,
+					       unsigned int val)
+{
+	if (*arg > val) {
+		*arg = val;
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/*
+ * Must set dev->hw_dev if you wish to dma directly into comedi's buffer.
+ * Also useful for retrieving a previously configured hardware device of
+ * known bus type.  Set automatically for auto-configured devices.
+ * Automatically set to NULL when detaching hardware device.
+ */
+int comedi_set_hw_dev(struct comedi_device *dev, struct device *hw_dev);
+
+/**
+ * comedi_buf_n_bytes_ready - Determine amount of unread data in buffer
+ * @s: COMEDI subdevice.
+ *
+ * Determines the number of bytes of unread data in the asynchronous
+ * acquisition data buffer for a subdevice.  The data in question might not
+ * have been fully "munged" yet.
+ *
+ * Returns: The amount of unread data in bytes.
+ */
+static inline unsigned int comedi_buf_n_bytes_ready(struct comedi_subdevice *s)
+{
+	return s->async->buf_write_count - s->async->buf_read_count;
+}
+
+unsigned int comedi_buf_write_alloc(struct comedi_subdevice *s, unsigned int n);
+unsigned int comedi_buf_write_free(struct comedi_subdevice *s, unsigned int n);
+
+unsigned int comedi_buf_read_n_available(struct comedi_subdevice *s);
+unsigned int comedi_buf_read_alloc(struct comedi_subdevice *s, unsigned int n);
+unsigned int comedi_buf_read_free(struct comedi_subdevice *s, unsigned int n);
+
+unsigned int comedi_buf_write_samples(struct comedi_subdevice *s,
+				      const void *data, unsigned int nsamples);
+unsigned int comedi_buf_read_samples(struct comedi_subdevice *s,
+				     void *data, unsigned int nsamples);
+
+/* drivers.c - general comedi driver functions */
+
+#define COMEDI_TIMEOUT_MS	1000
+
+int comedi_timeout(struct comedi_device *dev, struct comedi_subdevice *s,
+		   struct comedi_insn *insn,
+		   int (*cb)(struct comedi_device *dev,
+			     struct comedi_subdevice *s,
+			     struct comedi_insn *insn, unsigned long context),
+		   unsigned long context);
+
+unsigned int comedi_handle_events(struct comedi_device *dev,
+				  struct comedi_subdevice *s);
+
+int comedi_dio_insn_config(struct comedi_device *dev,
+			   struct comedi_subdevice *s,
+			   struct comedi_insn *insn, unsigned int *data,
+			   unsigned int mask);
+unsigned int comedi_dio_update_state(struct comedi_subdevice *s,
+				     unsigned int *data);
+unsigned int comedi_bytes_per_scan_cmd(struct comedi_subdevice *s,
+				       struct comedi_cmd *cmd);
+unsigned int comedi_bytes_per_scan(struct comedi_subdevice *s);
+unsigned int comedi_nscans_left(struct comedi_subdevice *s,
+				unsigned int nscans);
+unsigned int comedi_nsamples_left(struct comedi_subdevice *s,
+				  unsigned int nsamples);
+void comedi_inc_scan_progress(struct comedi_subdevice *s,
+			      unsigned int num_bytes);
+
+void *comedi_alloc_devpriv(struct comedi_device *dev, size_t size);
+int comedi_alloc_subdevices(struct comedi_device *dev, int num_subdevices);
+int comedi_alloc_subdev_readback(struct comedi_subdevice *s);
+
+int comedi_readback_insn_read(struct comedi_device *dev,
+			      struct comedi_subdevice *s,
+			      struct comedi_insn *insn, unsigned int *data);
+
+int comedi_load_firmware(struct comedi_device *dev, struct device *hw_dev,
+			 const char *name,
+			 int (*cb)(struct comedi_device *dev,
+				   const u8 *data, size_t size,
+				   unsigned long context),
+			 unsigned long context);
+
+int __comedi_request_region(struct comedi_device *dev,
+			    unsigned long start, unsigned long len);
+int comedi_request_region(struct comedi_device *dev,
+			  unsigned long start, unsigned long len);
+void comedi_legacy_detach(struct comedi_device *dev);
+
+int comedi_auto_config(struct device *hardware_device,
+		       struct comedi_driver *driver, unsigned long context);
+void comedi_auto_unconfig(struct device *hardware_device);
+
+int comedi_driver_register(struct comedi_driver *driver);
+void comedi_driver_unregister(struct comedi_driver *driver);
+
+/**
+ * module_comedi_driver() - Helper macro for registering a comedi driver
+ * @__comedi_driver: comedi_driver struct
+ *
+ * Helper macro for comedi drivers which do not do anything special in module
+ * init/exit. This eliminates a lot of boilerplate. Each module may only use
+ * this macro once, and calling it replaces module_init() and module_exit().
+ */
+#define module_comedi_driver(__comedi_driver) \
+	module_driver(__comedi_driver, comedi_driver_register, \
+			comedi_driver_unregister)
+
+#endif /* _COMEDIDEV_H */
diff --git a/include/linux/comedi/comedilib.h b/include/linux/comedi/comedilib.h
new file mode 100644
index 000000000000..0223c9cd9215
--- /dev/null
+++ b/include/linux/comedi/comedilib.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * comedilib.h
+ * Header file for kcomedilib
+ *
+ * COMEDI - Linux Control and Measurement Device Interface
+ * Copyright (C) 1998-2001 David A. Schleef <ds@schleef.org>
+ */
+
+#ifndef _LINUX_COMEDILIB_H
+#define _LINUX_COMEDILIB_H
+
+struct comedi_device *comedi_open(const char *path);
+int comedi_close(struct comedi_device *dev);
+int comedi_dio_get_config(struct comedi_device *dev, unsigned int subdev,
+			  unsigned int chan, unsigned int *io);
+int comedi_dio_config(struct comedi_device *dev, unsigned int subdev,
+		      unsigned int chan, unsigned int io);
+int comedi_dio_bitfield2(struct comedi_device *dev, unsigned int subdev,
+			 unsigned int mask, unsigned int *bits,
+			 unsigned int base_channel);
+int comedi_find_subdevice_by_type(struct comedi_device *dev, int type,
+				  unsigned int subd);
+int comedi_get_n_channels(struct comedi_device *dev, unsigned int subdevice);
+
+#endif
-- 
cgit v1.2.3


From 631e272b12075b60f7c7fc4f84f937d78a699844 Mon Sep 17 00:00:00 2001
From: Ian Abbott <abbotti@mev.co.uk>
Date: Wed, 17 Nov 2021 12:06:01 +0000
Subject: comedi: Move and rename "8255.h" to <linux/comedi/comedi_8255.h>

Some of the header files in "drivers/comedi/drivers/" are common enough
to be useful to out-of-tree comedi driver modules.  Using them for
out-of-tree module builds is hampered by the headers being outside the
"include/" directory so it is desirable to move them.

There are about a couple of dozen Comedi device drivers that use the
"comedi_8255" module to add digital I/O subdevices based on the
venerable 8255 Programmable Peripheral Interface chip.  The macros and
declarations to use that module are in the "8255.h" header file in the
comedi "drivers" directory.  Move it into "include/linux/comedi/" and
rename it to "comedi_8255.h" for naming consistency reasons.

Signed-off-by: Ian Abbott <abbotti@mev.co.uk>
Link: https://lore.kernel.org/r/20211117120604.117740-4-abbotti@mev.co.uk
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/comedi/comedi_8255.h | 42 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 include/linux/comedi/comedi_8255.h

(limited to 'include/linux')

diff --git a/include/linux/comedi/comedi_8255.h b/include/linux/comedi/comedi_8255.h
new file mode 100644
index 000000000000..b2a5bc6b3a49
--- /dev/null
+++ b/include/linux/comedi/comedi_8255.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * comedi_8255.h
+ * Generic 8255 digital I/O subdevice support
+ *
+ * COMEDI - Linux Control and Measurement Device Interface
+ * Copyright (C) 1998 David A. Schleef <ds@schleef.org>
+ */
+
+#ifndef _COMEDI_8255_H
+#define _COMEDI_8255_H
+
+#define I8255_SIZE		0x04
+
+#define I8255_DATA_A_REG	0x00
+#define I8255_DATA_B_REG	0x01
+#define I8255_DATA_C_REG	0x02
+#define I8255_CTRL_REG		0x03
+#define I8255_CTRL_C_LO_IO	BIT(0)
+#define I8255_CTRL_B_IO		BIT(1)
+#define I8255_CTRL_B_MODE	BIT(2)
+#define I8255_CTRL_C_HI_IO	BIT(3)
+#define I8255_CTRL_A_IO		BIT(4)
+#define I8255_CTRL_A_MODE(x)	((x) << 5)
+#define I8255_CTRL_CW		BIT(7)
+
+struct comedi_device;
+struct comedi_subdevice;
+
+int subdev_8255_init(struct comedi_device *dev, struct comedi_subdevice *s,
+		     int (*io)(struct comedi_device *dev, int dir, int port,
+			       int data, unsigned long regbase),
+		     unsigned long regbase);
+
+int subdev_8255_mm_init(struct comedi_device *dev, struct comedi_subdevice *s,
+			int (*io)(struct comedi_device *dev, int dir, int port,
+				  int data, unsigned long regbase),
+			unsigned long regbase);
+
+unsigned long subdev_8255_regbase(struct comedi_subdevice *s);
+
+#endif
-- 
cgit v1.2.3


From 44fb7affcfa4e968e9c2ede023ef0e15f06d8209 Mon Sep 17 00:00:00 2001
From: Ian Abbott <abbotti@mev.co.uk>
Date: Wed, 17 Nov 2021 12:06:02 +0000
Subject: comedi: Move "comedi_8254.h" to <linux/comedi/comedi_8254.h>

Some of the header files in "drivers/comedi/drivers/" are common enough
to be useful to out-of-tree comedi driver modules.  Using them for
out-of-tree module builds is hampered by the headers being outside the
"include/" directory so it is desirable to move them.

There are about a couple of dozen or so Comedi device drivers that use
the "comedi_8254" module to add timers based on the venerable 8254
Programmable Interval Timer chip.  The macros and declarations to use
that module are in the "comedi_8254.h" header file in the comedi
"drivers" directory.  Move it into "include/linux/comedi/".

Signed-off-by: Ian Abbott <abbotti@mev.co.uk>
Link: https://lore.kernel.org/r/20211117120604.117740-5-abbotti@mev.co.uk
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/comedi/comedi_8254.h | 134 +++++++++++++++++++++++++++++++++++++
 1 file changed, 134 insertions(+)
 create mode 100644 include/linux/comedi/comedi_8254.h

(limited to 'include/linux')

diff --git a/include/linux/comedi/comedi_8254.h b/include/linux/comedi/comedi_8254.h
new file mode 100644
index 000000000000..d8264417e53c
--- /dev/null
+++ b/include/linux/comedi/comedi_8254.h
@@ -0,0 +1,134 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * comedi_8254.h
+ * Generic 8254 timer/counter support
+ * Copyright (C) 2014 H Hartley Sweeten <hsweeten@visionengravers.com>
+ *
+ * COMEDI - Linux Control and Measurement Device Interface
+ * Copyright (C) 2000 David A. Schleef <ds@schleef.org>
+ */
+
+#ifndef _COMEDI_8254_H
+#define _COMEDI_8254_H
+
+#include <linux/types.h>
+
+struct comedi_device;
+struct comedi_insn;
+struct comedi_subdevice;
+
+/*
+ * Common oscillator base values in nanoseconds
+ */
+#define I8254_OSC_BASE_10MHZ	100
+#define I8254_OSC_BASE_5MHZ	200
+#define I8254_OSC_BASE_4MHZ	250
+#define I8254_OSC_BASE_2MHZ	500
+#define I8254_OSC_BASE_1MHZ	1000
+#define I8254_OSC_BASE_100KHZ	10000
+#define I8254_OSC_BASE_10KHZ	100000
+#define I8254_OSC_BASE_1KHZ	1000000
+
+/*
+ * I/O access size used to read/write registers
+ */
+#define I8254_IO8		1
+#define I8254_IO16		2
+#define I8254_IO32		4
+
+/*
+ * Register map for generic 8254 timer (I8254_IO8 with 0 regshift)
+ */
+#define I8254_COUNTER0_REG		0x00
+#define I8254_COUNTER1_REG		0x01
+#define I8254_COUNTER2_REG		0x02
+#define I8254_CTRL_REG			0x03
+#define I8254_CTRL_SEL_CTR(x)		((x) << 6)
+#define I8254_CTRL_READBACK(x)		(I8254_CTRL_SEL_CTR(3) | BIT(x))
+#define I8254_CTRL_READBACK_COUNT	I8254_CTRL_READBACK(4)
+#define I8254_CTRL_READBACK_STATUS	I8254_CTRL_READBACK(5)
+#define I8254_CTRL_READBACK_SEL_CTR(x)	(2 << (x))
+#define I8254_CTRL_RW(x)		(((x) & 0x3) << 4)
+#define I8254_CTRL_LATCH		I8254_CTRL_RW(0)
+#define I8254_CTRL_LSB_ONLY		I8254_CTRL_RW(1)
+#define I8254_CTRL_MSB_ONLY		I8254_CTRL_RW(2)
+#define I8254_CTRL_LSB_MSB		I8254_CTRL_RW(3)
+
+/* counter maps zero to 0x10000 */
+#define I8254_MAX_COUNT			0x10000
+
+/**
+ * struct comedi_8254 - private data used by this module
+ * @iobase:		PIO base address of the registers (in/out)
+ * @mmio:		MMIO base address of the registers (read/write)
+ * @iosize:		I/O size used to access the registers (b/w/l)
+ * @regshift:		register gap shift
+ * @osc_base:		cascaded oscillator speed in ns
+ * @divisor:		divisor for single counter
+ * @divisor1:		divisor loaded into first cascaded counter
+ * @divisor2:		divisor loaded into second cascaded counter
+ * #next_div:		next divisor for single counter
+ * @next_div1:		next divisor to use for first cascaded counter
+ * @next_div2:		next divisor to use for second cascaded counter
+ * @clock_src;		current clock source for each counter (driver specific)
+ * @gate_src;		current gate source  for each counter (driver specific)
+ * @busy:		flags used to indicate that a counter is "busy"
+ * @insn_config:	driver specific (*insn_config) callback
+ */
+struct comedi_8254 {
+	unsigned long iobase;
+	void __iomem *mmio;
+	unsigned int iosize;
+	unsigned int regshift;
+	unsigned int osc_base;
+	unsigned int divisor;
+	unsigned int divisor1;
+	unsigned int divisor2;
+	unsigned int next_div;
+	unsigned int next_div1;
+	unsigned int next_div2;
+	unsigned int clock_src[3];
+	unsigned int gate_src[3];
+	bool busy[3];
+
+	int (*insn_config)(struct comedi_device *dev,
+			   struct comedi_subdevice *s,
+			   struct comedi_insn *insn, unsigned int *data);
+};
+
+unsigned int comedi_8254_status(struct comedi_8254 *i8254,
+				unsigned int counter);
+unsigned int comedi_8254_read(struct comedi_8254 *i8254, unsigned int counter);
+void comedi_8254_write(struct comedi_8254 *i8254,
+		       unsigned int counter, unsigned int val);
+
+int comedi_8254_set_mode(struct comedi_8254 *i8254,
+			 unsigned int counter, unsigned int mode);
+int comedi_8254_load(struct comedi_8254 *i8254,
+		     unsigned int counter, unsigned int val, unsigned int mode);
+
+void comedi_8254_pacer_enable(struct comedi_8254 *i8254,
+			      unsigned int counter1, unsigned int counter2,
+			      bool enable);
+void comedi_8254_update_divisors(struct comedi_8254 *i8254);
+void comedi_8254_cascade_ns_to_timer(struct comedi_8254 *i8254,
+				     unsigned int *nanosec, unsigned int flags);
+void comedi_8254_ns_to_timer(struct comedi_8254 *i8254,
+			     unsigned int *nanosec, unsigned int flags);
+
+void comedi_8254_set_busy(struct comedi_8254 *i8254,
+			  unsigned int counter, bool busy);
+
+void comedi_8254_subdevice_init(struct comedi_subdevice *s,
+				struct comedi_8254 *i8254);
+
+struct comedi_8254 *comedi_8254_init(unsigned long iobase,
+				     unsigned int osc_base,
+				     unsigned int iosize,
+				     unsigned int regshift);
+struct comedi_8254 *comedi_8254_mm_init(void __iomem *mmio,
+					unsigned int osc_base,
+					unsigned int iosize,
+					unsigned int regshift);
+
+#endif	/* _COMEDI_8254_H */
-- 
cgit v1.2.3


From fe7a4f5b9548456246ffda143bab59922acda9fd Mon Sep 17 00:00:00 2001
From: Ian Abbott <abbotti@mev.co.uk>
Date: Wed, 17 Nov 2021 12:06:03 +0000
Subject: comedi: Move "comedi_isadma.h" to <linux/comedi/comedi_isadma.h>

Some of the header files in "drivers/comedi/drivers/" are common enough
to be useful to out-of-tree comedi driver modules.  Using them for
out-of-tree module builds is hampered by the headers being outside the
"include/" directory so it is desirable to move them.

There are about a half a dozen or so Comedi device drivers that use the
"comedi_isadma" module to add ISA DMA support.  The macros and
declarations to use that module are in the "comedi_isadma.h" header file
in the comedi "drivers" directory.  Move it into
"include/linux/comedi/".

Signed-off-by: Ian Abbott <abbotti@mev.co.uk>
Link: https://lore.kernel.org/r/20211117120604.117740-6-abbotti@mev.co.uk
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/comedi/comedi_isadma.h | 114 +++++++++++++++++++++++++++++++++++
 1 file changed, 114 insertions(+)
 create mode 100644 include/linux/comedi/comedi_isadma.h

(limited to 'include/linux')

diff --git a/include/linux/comedi/comedi_isadma.h b/include/linux/comedi/comedi_isadma.h
new file mode 100644
index 000000000000..9d2b12db7e6e
--- /dev/null
+++ b/include/linux/comedi/comedi_isadma.h
@@ -0,0 +1,114 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * COMEDI ISA DMA support functions
+ * Copyright (c) 2014 H Hartley Sweeten <hsweeten@visionengravers.com>
+ */
+
+#ifndef _COMEDI_ISADMA_H
+#define _COMEDI_ISADMA_H
+
+#include <linux/types.h>
+
+struct comedi_device;
+struct device;
+
+/*
+ * These are used to avoid issues when <asm/dma.h> and the DMA_MODE_
+ * defines are not available.
+ */
+#define COMEDI_ISADMA_READ	0
+#define COMEDI_ISADMA_WRITE	1
+
+/**
+ * struct comedi_isadma_desc - cookie for ISA DMA
+ * @virt_addr:	virtual address of buffer
+ * @hw_addr:	hardware (bus) address of buffer
+ * @chan:	DMA channel
+ * @maxsize:	allocated size of buffer (in bytes)
+ * @size:	transfer size (in bytes)
+ * @mode:	DMA_MODE_READ or DMA_MODE_WRITE
+ */
+struct comedi_isadma_desc {
+	void *virt_addr;
+	dma_addr_t hw_addr;
+	unsigned int chan;
+	unsigned int maxsize;
+	unsigned int size;
+	char mode;
+};
+
+/**
+ * struct comedi_isadma - ISA DMA data
+ * @dev:	device to allocate non-coherent memory for
+ * @desc:	cookie for each DMA buffer
+ * @n_desc:	the number of cookies
+ * @cur_dma:	the current cookie in use
+ * @chan:	the first DMA channel requested
+ * @chan2:	the second DMA channel requested
+ */
+struct comedi_isadma {
+	struct device *dev;
+	struct comedi_isadma_desc *desc;
+	int n_desc;
+	int cur_dma;
+	unsigned int chan;
+	unsigned int chan2;
+};
+
+#if IS_ENABLED(CONFIG_ISA_DMA_API)
+
+void comedi_isadma_program(struct comedi_isadma_desc *desc);
+unsigned int comedi_isadma_disable(unsigned int dma_chan);
+unsigned int comedi_isadma_disable_on_sample(unsigned int dma_chan,
+					     unsigned int size);
+unsigned int comedi_isadma_poll(struct comedi_isadma *dma);
+void comedi_isadma_set_mode(struct comedi_isadma_desc *desc, char dma_dir);
+
+struct comedi_isadma *comedi_isadma_alloc(struct comedi_device *dev,
+					  int n_desc, unsigned int dma_chan1,
+					  unsigned int dma_chan2,
+					  unsigned int maxsize, char dma_dir);
+void comedi_isadma_free(struct comedi_isadma *dma);
+
+#else	/* !IS_ENABLED(CONFIG_ISA_DMA_API) */
+
+static inline void comedi_isadma_program(struct comedi_isadma_desc *desc)
+{
+}
+
+static inline unsigned int comedi_isadma_disable(unsigned int dma_chan)
+{
+	return 0;
+}
+
+static inline unsigned int
+comedi_isadma_disable_on_sample(unsigned int dma_chan, unsigned int size)
+{
+	return 0;
+}
+
+static inline unsigned int comedi_isadma_poll(struct comedi_isadma *dma)
+{
+	return 0;
+}
+
+static inline void comedi_isadma_set_mode(struct comedi_isadma_desc *desc,
+					  char dma_dir)
+{
+}
+
+static inline struct comedi_isadma *
+comedi_isadma_alloc(struct comedi_device *dev, int n_desc,
+		    unsigned int dma_chan1, unsigned int dma_chan2,
+		    unsigned int maxsize, char dma_dir)
+{
+	return NULL;
+}
+
+static inline void comedi_isadma_free(struct comedi_isadma *dma)
+{
+}
+
+#endif	/* !IS_ENABLED(CONFIG_ISA_DMA_API) */
+
+#endif	/* #ifndef _COMEDI_ISADMA_H */
-- 
cgit v1.2.3


From 2cca3465147d650be3de04927a99784b30251ade Mon Sep 17 00:00:00 2001
From: Alexander Usyskin <alexander.usyskin@intel.com>
Date: Fri, 12 Nov 2021 08:28:09 +0200
Subject: mei: bus: add client dma interface

Expose the client dma mapping via mei client bus interface.
The client dma has to be mapped before the device is enabled,
therefore we need to create device linking already during mapping
and we need to unmap after the client is disable hence we need to
postpone the unlink and flush till unmapping or when
destroying the device.

Signed-off-by: Alexander Usyskin <alexander.usyskin@intel.com>
Co-developed-by: Tomas Winkler <tomas.winkler@intel.com>
Signed-off-by: Tomas Winkler <tomas.winkler@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/20210420172755.12178-1-emmanuel.grumbach@intel.com
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
Link: https://lore.kernel.org/r/20211112062814.7502-1-emmanuel.grumbach@intel.com
---
 include/linux/mei_cl_bus.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mei_cl_bus.h b/include/linux/mei_cl_bus.h
index c6786c12b207..df1fab44ea5c 100644
--- a/include/linux/mei_cl_bus.h
+++ b/include/linux/mei_cl_bus.h
@@ -117,4 +117,7 @@ int mei_cldev_enable(struct mei_cl_device *cldev);
 int mei_cldev_disable(struct mei_cl_device *cldev);
 bool mei_cldev_enabled(const struct mei_cl_device *cldev);
 
+void *mei_cldev_dma_map(struct mei_cl_device *cldev, u8 buffer_id, size_t size);
+int mei_cldev_dma_unmap(struct mei_cl_device *cldev);
+
 #endif /* _LINUX_MEI_CL_BUS_H */
-- 
cgit v1.2.3


From ec15baec3272bbec576f2ce7ce47765a8e9b7b1c Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 26 Nov 2021 19:28:43 +0200
Subject: net: ptp: add a definition for the UDP port for IEEE 1588 general
 messages

As opposed to event messages (Sync, PdelayReq etc) which require
timestamping, general messages (Announce, FollowUp etc) do not.
In PTP they are part of different streams of data.

IEEE 1588-2008 Annex D.2 "UDP port numbers" states that the UDP
destination port assigned by IANA is 319 for event messages, and 320 for
general messages. Yet the kernel seems to be missing the definition for
general messages. This patch adds it.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Acked-by: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/ptp_classify.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/ptp_classify.h b/include/linux/ptp_classify.h
index ae04968a3a47..9afd34a2d36c 100644
--- a/include/linux/ptp_classify.h
+++ b/include/linux/ptp_classify.h
@@ -37,6 +37,7 @@
 #define PTP_MSGTYPE_PDELAY_RESP 0x3
 
 #define PTP_EV_PORT 319
+#define PTP_GEN_PORT 320
 #define PTP_GEN_BIT 0x08 /* indicates general message, if set in message type */
 
 #define OFF_PTP_SOURCE_UUID	22 /* PTPv1 only */
-- 
cgit v1.2.3


From a6914afcdf0e3fb853fce0e0c04710be7427b62f Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 10 Nov 2021 12:31:28 +0200
Subject: kobject: Replace kernel.h with the necessary inclusions

When kernel.h is used in the headers it adds a lot into dependency hell,
especially when there are circular dependencies are involved.

Replace kernel.h inclusion with the list of what is really being used.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20211110103128.59888-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/kobject.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kobject.h b/include/linux/kobject.h
index efd56f990a46..c740062b4b1a 100644
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -19,10 +19,10 @@
 #include <linux/list.h>
 #include <linux/sysfs.h>
 #include <linux/compiler.h>
+#include <linux/container_of.h>
 #include <linux/spinlock.h>
 #include <linux/kref.h>
 #include <linux/kobject_ns.h>
-#include <linux/kernel.h>
 #include <linux/wait.h>
 #include <linux/atomic.h>
 #include <linux/workqueue.h>
-- 
cgit v1.2.3


From 6a2d2ddf2c345e0149bfbffdddc4768a9ab0a741 Mon Sep 17 00:00:00 2001
From: Javier Martinez Canillas <javierm@redhat.com>
Date: Fri, 12 Nov 2021 14:32:27 +0100
Subject: drm: Move nomodeset kernel parameter to the DRM subsystem

The "nomodeset" kernel cmdline parameter is handled by the vgacon driver
but the exported vgacon_text_force() symbol is only used by DRM drivers.

It makes much more sense for the parameter logic to be in the subsystem
of the drivers that are making use of it.

Let's move the vgacon_text_force() function and related logic to the DRM
subsystem. While doing that, rename it to drm_firmware_drivers_only() and
make it return true if "nomodeset" was used and false otherwise. This is
a better description of the condition that the drivers are testing for.

Suggested-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Signed-off-by: Javier Martinez Canillas <javierm@redhat.com>
Acked-by: Thomas Zimmermann <tzimmermann@suse.de>
Acked-by: Jani Nikula <jani.nikula@intel.com>
Acked-by: Pekka Paalanen <pekka.paalanen@collabora.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20211112133230.1595307-4-javierm@redhat.com
---
 include/linux/console.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/console.h b/include/linux/console.h
index a97f277cfdfa..7cd758a4f44e 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -219,12 +219,6 @@ extern atomic_t ignore_console_lock_warning;
 #define VESA_HSYNC_SUSPEND      2
 #define VESA_POWERDOWN          3
 
-#ifdef CONFIG_VGA_CONSOLE
-extern bool vgacon_text_force(void);
-#else
-static inline bool vgacon_text_force(void) { return false; }
-#endif
-
 extern void console_init(void);
 
 /* For deferred console takeover */
-- 
cgit v1.2.3


From ed14e769f64311769dcf20dde544b82c158d01b1 Mon Sep 17 00:00:00 2001
From: Paul Cercueil <paul@crapouillou.net>
Date: Mon, 15 Nov 2021 14:19:12 +0000
Subject: iio: buffer-dma: Remove unused iio_buffer_block struct

This structure was never used anywhere, so it can safely be dropped.

It will later be re-introduced as a different structure in a
different header.

Signed-off-by: Paul Cercueil <paul@crapouillou.net>
Reviewed-by: Alexandru Ardelean <ardeleanalex@gmail.com>
Link: https://lore.kernel.org/r/20211115141925.60164-3-paul@crapouillou.net
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/buffer-dma.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iio/buffer-dma.h b/include/linux/iio/buffer-dma.h
index ff15c61bf319..6564bdcdac66 100644
--- a/include/linux/iio/buffer-dma.h
+++ b/include/linux/iio/buffer-dma.h
@@ -17,11 +17,6 @@ struct iio_dma_buffer_queue;
 struct iio_dma_buffer_ops;
 struct device;
 
-struct iio_buffer_block {
-	u32 size;
-	u32 bytes_used;
-};
-
 /**
  * enum iio_block_state - State of a struct iio_dma_buffer_block
  * @IIO_BLOCK_STATE_DEQUEUED: Block is not queued
-- 
cgit v1.2.3


From ffc7c5172a6d1f7ec468066a7172ce65baf1e3e1 Mon Sep 17 00:00:00 2001
From: Antoniu Miclaus <antoniu.miclaus@analog.com>
Date: Fri, 19 Nov 2021 10:56:27 +0200
Subject: iio: expose shared parameter in IIO_ENUM_AVAILABLE

The shared parameter should be configurable based on its usage, and not
constrained to IIO_SHARED_BY_TYPE.

This patch aims to improve the flexibility in using the
IIO_ENUM_AVAILABLE define and avoid redefining custom iio enums that
expose the shared parameter.

An example is the ad5766.c driver where IIO_ENUM_AVAILABLE_SHARED was
defined in order to achieve `shared` parameter customization.

The current state of the IIO_ENUM_AVAILABLE implementation will imply
similar redefinitions each time a driver will require access to the
`shared` parameter. An example would be admv1013 driver which will
require custom device attribute for the frequency translation modes:
Quadrature I/Q mode and Intermediate Frequency mode.

Signed-off-by: Antoniu Miclaus <antoniu.miclaus@analog.com>
Reviewed-by: Alexandru Ardelean <ardeleanalex@gmail.com>
Link: https://lore.kernel.org/r/20211119085627.6348-1-antoniu.miclaus@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/iio.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index 324561b7a5e8..07025d6b3de1 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -103,15 +103,16 @@ ssize_t iio_enum_write(struct iio_dev *indio_dev,
 /**
  * IIO_ENUM_AVAILABLE() - Initialize enum available extended channel attribute
  * @_name:	Attribute name ("_available" will be appended to the name)
+ * @_shared:	Whether the attribute is shared between all channels
  * @_e:		Pointer to an iio_enum struct
  *
  * Creates a read only attribute which lists all the available enum items in a
  * space separated list. This should usually be used together with IIO_ENUM()
  */
-#define IIO_ENUM_AVAILABLE(_name, _e) \
+#define IIO_ENUM_AVAILABLE(_name, _shared, _e) \
 { \
 	.name = (_name "_available"), \
-	.shared = IIO_SHARED_BY_TYPE, \
+	.shared = _shared, \
 	.read = iio_enum_available_read, \
 	.private = (uintptr_t)(_e), \
 }
-- 
cgit v1.2.3


From 75c5bd68b699bbcb6d25879644d62de4da14ab92 Mon Sep 17 00:00:00 2001
From: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Date: Fri, 26 Nov 2021 10:48:19 +0100
Subject: ieee80211: change HE nominal packet padding value defines

It's easier to use and understand, and to extend for EHT later,
if we use the values here instead of the shifted values.

Unfortunately, we need to add _POS so that we can use it in
places like iwlwifi/mvm where constants are needed.

While at it, fix the typo ("NOMIMAL") which also helps catch any
conflicts.

Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://lore.kernel.org/r/20211126104817.7c29a05b8eb5.I2ca9faf06e177e3035bec91e2ae53c2f91d41774@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 11d7af260f20..559b6c644938 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -2258,11 +2258,12 @@ enum ieee80211_client_reg_power {
 #define IEEE80211_HE_PHY_CAP9_RX_1024_QAM_LESS_THAN_242_TONE_RU		0x08
 #define IEEE80211_HE_PHY_CAP9_RX_FULL_BW_SU_USING_MU_WITH_COMP_SIGB	0x10
 #define IEEE80211_HE_PHY_CAP9_RX_FULL_BW_SU_USING_MU_WITH_NON_COMP_SIGB	0x20
-#define IEEE80211_HE_PHY_CAP9_NOMIMAL_PKT_PADDING_0US			0x00
-#define IEEE80211_HE_PHY_CAP9_NOMIMAL_PKT_PADDING_8US			0x40
-#define IEEE80211_HE_PHY_CAP9_NOMIMAL_PKT_PADDING_16US			0x80
-#define IEEE80211_HE_PHY_CAP9_NOMIMAL_PKT_PADDING_RESERVED		0xc0
-#define IEEE80211_HE_PHY_CAP9_NOMIMAL_PKT_PADDING_MASK			0xc0
+#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_0US			0x0
+#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_8US			0x1
+#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_16US			0x2
+#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_RESERVED		0x3
+#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_POS			6
+#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_MASK			0xc0
 
 #define IEEE80211_HE_PHY_CAP10_HE_MU_M1RU_MAX_LTF			0x01
 
-- 
cgit v1.2.3


From 4ba0b2c294fe691921271372f7b59e5cc2ce4b0f Mon Sep 17 00:00:00 2001
From: Russ Weight <russell.h.weight@intel.com>
Date: Thu, 18 Nov 2021 17:55:51 -0800
Subject: fpga: mgr: Use standard dev_release for class driver

The FPGA manager class driver data structure is being treated as a
managed resource instead of using the standard dev_release call-back
function to release the class data structure. This change removes
the managed resource code for the freeing of the class data structure
and combines the create() and register() functions into a single
register() or register_full() function.

The register_full() function accepts an info data structure to provide
flexibility in passing optional parameters. The register() function
supports the current parameter list for users that don't require the
use of optional parameters.

The devm_fpga_mgr_register() function is retained, and the
devm_fpga_mgr_register_full() function is added.

Signed-off-by: Russ Weight <russell.h.weight@intel.com>
Reviewed-by: Xu Yilun <yilun.xu@intel.com>
Acked-by: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Moritz Fischer <mdf@kernel.org>
---
 include/linux/fpga/fpga-mgr.h | 62 ++++++++++++++++++++++++++++---------------
 1 file changed, 41 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fpga/fpga-mgr.h b/include/linux/fpga/fpga-mgr.h
index 474c1f506307..0f9468771bb9 100644
--- a/include/linux/fpga/fpga-mgr.h
+++ b/include/linux/fpga/fpga-mgr.h
@@ -105,6 +105,36 @@ struct fpga_image_info {
 #endif
 };
 
+/**
+ * struct fpga_compat_id - id for compatibility check
+ *
+ * @id_h: high 64bit of the compat_id
+ * @id_l: low 64bit of the compat_id
+ */
+struct fpga_compat_id {
+	u64 id_h;
+	u64 id_l;
+};
+
+/**
+ * struct fpga_manager_info - collection of parameters for an FPGA Manager
+ * @name: fpga manager name
+ * @compat_id: FPGA manager id for compatibility check.
+ * @mops: pointer to structure of fpga manager ops
+ * @priv: fpga manager private data
+ *
+ * fpga_manager_info contains parameters for the register_full function.
+ * These are separated into an info structure because they some are optional
+ * others could be added to in the future. The info structure facilitates
+ * maintaining a stable API.
+ */
+struct fpga_manager_info {
+	const char *name;
+	struct fpga_compat_id *compat_id;
+	const struct fpga_manager_ops *mops;
+	void *priv;
+};
+
 /**
  * struct fpga_manager_ops - ops for low level fpga manager drivers
  * @initial_header_size: Maximum number of bytes that should be passed into write_init
@@ -143,17 +173,6 @@ struct fpga_manager_ops {
 #define FPGA_MGR_STATUS_IP_PROTOCOL_ERR		BIT(3)
 #define FPGA_MGR_STATUS_FIFO_OVERFLOW_ERR	BIT(4)
 
-/**
- * struct fpga_compat_id - id for compatibility check
- *
- * @id_h: high 64bit of the compat_id
- * @id_l: low 64bit of the compat_id
- */
-struct fpga_compat_id {
-	u64 id_h;
-	u64 id_l;
-};
-
 /**
  * struct fpga_manager - fpga manager structure
  * @name: name of low level fpga manager
@@ -191,17 +210,18 @@ struct fpga_manager *fpga_mgr_get(struct device *dev);
 
 void fpga_mgr_put(struct fpga_manager *mgr);
 
-struct fpga_manager *fpga_mgr_create(struct device *dev, const char *name,
-				     const struct fpga_manager_ops *mops,
-				     void *priv);
-void fpga_mgr_free(struct fpga_manager *mgr);
-int fpga_mgr_register(struct fpga_manager *mgr);
-void fpga_mgr_unregister(struct fpga_manager *mgr);
+struct fpga_manager *
+fpga_mgr_register_full(struct device *parent, const struct fpga_manager_info *info);
 
-int devm_fpga_mgr_register(struct device *dev, struct fpga_manager *mgr);
+struct fpga_manager *
+fpga_mgr_register(struct device *parent, const char *name,
+		  const struct fpga_manager_ops *mops, void *priv);
+void fpga_mgr_unregister(struct fpga_manager *mgr);
 
-struct fpga_manager *devm_fpga_mgr_create(struct device *dev, const char *name,
-					  const struct fpga_manager_ops *mops,
-					  void *priv);
+struct fpga_manager *
+devm_fpga_mgr_register_full(struct device *parent, const struct fpga_manager_info *info);
+struct fpga_manager *
+devm_fpga_mgr_register(struct device *parent, const char *name,
+		       const struct fpga_manager_ops *mops, void *priv);
 
 #endif /*_LINUX_FPGA_MGR_H */
-- 
cgit v1.2.3


From 0d70af3c2530a70f1b2c197feaa63fbd3548ce34 Mon Sep 17 00:00:00 2001
From: Russ Weight <russell.h.weight@intel.com>
Date: Thu, 18 Nov 2021 17:55:52 -0800
Subject: fpga: bridge: Use standard dev_release for class driver

The FPGA bridge class driver data structure is being treated as a
managed resource instead of using the standard dev_release call-back
function to release the class data structure. This change removes
the managed resource code and combines the create() and register()
functions into a single register() function.

Signed-off-by: Russ Weight <russell.h.weight@intel.com>
Reviewed-by: Xu Yilun <yilun.xu@intel.com>
Acked-by: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Moritz Fischer <mdf@kernel.org>
---
 include/linux/fpga/fpga-bridge.h | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fpga/fpga-bridge.h b/include/linux/fpga/fpga-bridge.h
index 6c3c28806ff1..223da48a6d18 100644
--- a/include/linux/fpga/fpga-bridge.h
+++ b/include/linux/fpga/fpga-bridge.h
@@ -22,6 +22,23 @@ struct fpga_bridge_ops {
 	const struct attribute_group **groups;
 };
 
+/**
+ * struct fpga_bridge_info - collection of parameters an FPGA Bridge
+ * @name: fpga bridge name
+ * @br_ops: pointer to structure of fpga bridge ops
+ * @priv: fpga bridge private data
+ *
+ * fpga_bridge_info contains parameters for the register function. These
+ * are separated into an info structure because they some are optional
+ * others could be added to in the future. The info structure facilitates
+ * maintaining a stable API.
+ */
+struct fpga_bridge_info {
+	const char *name;
+	const struct fpga_bridge_ops *br_ops;
+	void *priv;
+};
+
 /**
  * struct fpga_bridge - FPGA bridge structure
  * @name: name of low level FPGA bridge
@@ -62,15 +79,10 @@ int of_fpga_bridge_get_to_list(struct device_node *np,
 			       struct fpga_image_info *info,
 			       struct list_head *bridge_list);
 
-struct fpga_bridge *fpga_bridge_create(struct device *dev, const char *name,
-				       const struct fpga_bridge_ops *br_ops,
-				       void *priv);
-void fpga_bridge_free(struct fpga_bridge *br);
-int fpga_bridge_register(struct fpga_bridge *br);
+struct fpga_bridge *
+fpga_bridge_register(struct device *parent, const char *name,
+		     const struct fpga_bridge_ops *br_ops,
+		     void *priv);
 void fpga_bridge_unregister(struct fpga_bridge *br);
 
-struct fpga_bridge
-*devm_fpga_bridge_create(struct device *dev, const char *name,
-			 const struct fpga_bridge_ops *br_ops, void *priv);
-
 #endif /* _LINUX_FPGA_BRIDGE_H */
-- 
cgit v1.2.3


From 8886a579744fbfa53e69aa453ed10ae3b1f9abac Mon Sep 17 00:00:00 2001
From: Russ Weight <russell.h.weight@intel.com>
Date: Thu, 18 Nov 2021 17:55:53 -0800
Subject: fpga: region: Use standard dev_release for class driver

The FPGA region class driver data structure is being treated as a
managed resource instead of using the standard dev_release call-back
function to release the class data structure. This change removes the
managed resource code and combines the create() and register()
functions into a single register() or register_full() function.

The register_full() function accepts an info data structure to provide
flexibility in passing optional parameters. The register() function
supports the current parameter list for users that don't require the
use of optional parameters.

Signed-off-by: Russ Weight <russell.h.weight@intel.com>
Reviewed-by: Xu Yilun <yilun.xu@intel.com>
Acked-by: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Moritz Fischer <mdf@kernel.org>
---
 include/linux/fpga/fpga-region.h | 36 +++++++++++++++++++++++++++---------
 1 file changed, 27 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fpga/fpga-region.h b/include/linux/fpga/fpga-region.h
index 27cb706275db..3b87f232425c 100644
--- a/include/linux/fpga/fpga-region.h
+++ b/include/linux/fpga/fpga-region.h
@@ -7,6 +7,27 @@
 #include <linux/fpga/fpga-mgr.h>
 #include <linux/fpga/fpga-bridge.h>
 
+struct fpga_region;
+
+/**
+ * struct fpga_region_info - collection of parameters an FPGA Region
+ * @mgr: fpga region manager
+ * @compat_id: FPGA region id for compatibility check.
+ * @priv: fpga region private data
+ * @get_bridges: optional function to get bridges to a list
+ *
+ * fpga_region_info contains parameters for the register_full function.
+ * These are separated into an info structure because they some are optional
+ * others could be added to in the future. The info structure facilitates
+ * maintaining a stable API.
+ */
+struct fpga_region_info {
+	struct fpga_manager *mgr;
+	struct fpga_compat_id *compat_id;
+	void *priv;
+	int (*get_bridges)(struct fpga_region *region);
+};
+
 /**
  * struct fpga_region - FPGA Region structure
  * @dev: FPGA Region device
@@ -37,15 +58,12 @@ struct fpga_region *fpga_region_class_find(
 
 int fpga_region_program_fpga(struct fpga_region *region);
 
-struct fpga_region
-*fpga_region_create(struct device *dev, struct fpga_manager *mgr,
-		    int (*get_bridges)(struct fpga_region *));
-void fpga_region_free(struct fpga_region *region);
-int fpga_region_register(struct fpga_region *region);
-void fpga_region_unregister(struct fpga_region *region);
+struct fpga_region *
+fpga_region_register_full(struct device *parent, const struct fpga_region_info *info);
 
-struct fpga_region
-*devm_fpga_region_create(struct device *dev, struct fpga_manager *mgr,
-			int (*get_bridges)(struct fpga_region *));
+struct fpga_region *
+fpga_region_register(struct device *parent, struct fpga_manager *mgr,
+		     int (*get_bridges)(struct fpga_region *));
+void fpga_region_unregister(struct fpga_region *region);
 
 #endif /* _FPGA_REGION_H */
-- 
cgit v1.2.3


From 306456c21c792ac633e660bc45f0854b612a0e98 Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Date: Tue, 16 Nov 2021 14:54:35 +0200
Subject: mfd: bd70528: Drop BD70528 support

The only known BD70528 use-cases are such that the PMIC is controlled
from separate MCU which is not running Linux. I am not aware of
any Linux driver users. Furthermore, it seems there is no demand for
this IC. Let's ease the maintenance burden and drop the driver. We can
always add it back if there is sudden need for it.

Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/cf7dfd98b3403ad363b2b48b57bdbfd57a6416cb.1637066805.git.matti.vaittinen@fi.rohmeurope.com
---
 include/linux/mfd/rohm-bd70528.h | 389 ---------------------------------------
 include/linux/mfd/rohm-generic.h |   1 -
 2 files changed, 390 deletions(-)
 delete mode 100644 include/linux/mfd/rohm-bd70528.h

(limited to 'include/linux')

diff --git a/include/linux/mfd/rohm-bd70528.h b/include/linux/mfd/rohm-bd70528.h
deleted file mode 100644
index 4a5966475a35..000000000000
--- a/include/linux/mfd/rohm-bd70528.h
+++ /dev/null
@@ -1,389 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/* Copyright (C) 2018 ROHM Semiconductors */
-
-#ifndef __LINUX_MFD_BD70528_H__
-#define __LINUX_MFD_BD70528_H__
-
-#include <linux/bits.h>
-#include <linux/device.h>
-#include <linux/mfd/rohm-generic.h>
-#include <linux/mfd/rohm-shared.h>
-#include <linux/regmap.h>
-
-enum {
-	BD70528_BUCK1,
-	BD70528_BUCK2,
-	BD70528_BUCK3,
-	BD70528_LDO1,
-	BD70528_LDO2,
-	BD70528_LDO3,
-	BD70528_LED1,
-	BD70528_LED2,
-};
-
-struct bd70528_data {
-	struct rohm_regmap_dev chip;
-	struct mutex rtc_timer_lock;
-};
-
-#define BD70528_BUCK_VOLTS 0x10
-#define BD70528_LDO_VOLTS 0x20
-
-#define BD70528_REG_BUCK1_EN	0x0F
-#define BD70528_REG_BUCK1_VOLT	0x15
-#define BD70528_REG_BUCK2_EN	0x10
-#define BD70528_REG_BUCK2_VOLT	0x16
-#define BD70528_REG_BUCK3_EN	0x11
-#define BD70528_REG_BUCK3_VOLT	0x17
-#define BD70528_REG_LDO1_EN	0x1b
-#define BD70528_REG_LDO1_VOLT	0x1e
-#define BD70528_REG_LDO2_EN	0x1c
-#define BD70528_REG_LDO2_VOLT	0x1f
-#define BD70528_REG_LDO3_EN	0x1d
-#define BD70528_REG_LDO3_VOLT	0x20
-#define BD70528_REG_LED_CTRL	0x2b
-#define BD70528_REG_LED_VOLT	0x29
-#define BD70528_REG_LED_EN	0x2a
-
-/* main irq registers */
-#define BD70528_REG_INT_MAIN	0x7E
-#define BD70528_REG_INT_MAIN_MASK 0x74
-
-/* 'sub irq' registers */
-#define BD70528_REG_INT_SHDN	0x7F
-#define BD70528_REG_INT_PWR_FLT	0x80
-#define BD70528_REG_INT_VR_FLT	0x81
-#define BD70528_REG_INT_MISC	0x82
-#define BD70528_REG_INT_BAT1	0x83
-#define BD70528_REG_INT_BAT2	0x84
-#define BD70528_REG_INT_RTC	0x85
-#define BD70528_REG_INT_GPIO	0x86
-#define BD70528_REG_INT_OP_FAIL	0x87
-
-#define BD70528_REG_INT_SHDN_MASK	0x75
-#define BD70528_REG_INT_PWR_FLT_MASK	0x76
-#define BD70528_REG_INT_VR_FLT_MASK	0x77
-#define BD70528_REG_INT_MISC_MASK	0x78
-#define BD70528_REG_INT_BAT1_MASK	0x79
-#define BD70528_REG_INT_BAT2_MASK	0x7a
-#define BD70528_REG_INT_RTC_MASK	0x7b
-#define BD70528_REG_INT_GPIO_MASK	0x7c
-#define BD70528_REG_INT_OP_FAIL_MASK	0x7d
-
-/* Reset related 'magic' registers */
-#define BD70528_REG_SHIPMODE	0x03
-#define BD70528_REG_HWRESET	0x04
-#define BD70528_REG_WARMRESET	0x05
-#define BD70528_REG_STANDBY	0x06
-
-/* GPIO registers */
-#define BD70528_REG_GPIO_STATE	0x8F
-
-#define BD70528_REG_GPIO1_IN	0x4d
-#define BD70528_REG_GPIO2_IN	0x4f
-#define BD70528_REG_GPIO3_IN	0x51
-#define BD70528_REG_GPIO4_IN	0x53
-#define BD70528_REG_GPIO1_OUT	0x4e
-#define BD70528_REG_GPIO2_OUT	0x50
-#define BD70528_REG_GPIO3_OUT	0x52
-#define BD70528_REG_GPIO4_OUT	0x54
-
-/* RTC */
-
-#define BD70528_REG_RTC_COUNT_H		0x2d
-#define BD70528_REG_RTC_COUNT_L		0x2e
-#define BD70528_REG_RTC_SEC		0x2f
-#define BD70528_REG_RTC_MINUTE		0x30
-#define BD70528_REG_RTC_HOUR		0x31
-#define BD70528_REG_RTC_WEEK		0x32
-#define BD70528_REG_RTC_DAY		0x33
-#define BD70528_REG_RTC_MONTH		0x34
-#define BD70528_REG_RTC_YEAR		0x35
-
-#define BD70528_REG_RTC_ALM_SEC		0x36
-#define BD70528_REG_RTC_ALM_START	BD70528_REG_RTC_ALM_SEC
-#define BD70528_REG_RTC_ALM_MINUTE	0x37
-#define BD70528_REG_RTC_ALM_HOUR	0x38
-#define BD70528_REG_RTC_ALM_WEEK	0x39
-#define BD70528_REG_RTC_ALM_DAY		0x3a
-#define BD70528_REG_RTC_ALM_MONTH	0x3b
-#define BD70528_REG_RTC_ALM_YEAR	0x3c
-#define BD70528_REG_RTC_ALM_MASK	0x3d
-#define BD70528_REG_RTC_ALM_REPEAT	0x3e
-#define BD70528_REG_RTC_START		BD70528_REG_RTC_SEC
-
-#define BD70528_REG_RTC_WAKE_SEC	0x43
-#define BD70528_REG_RTC_WAKE_START	BD70528_REG_RTC_WAKE_SEC
-#define BD70528_REG_RTC_WAKE_MIN	0x44
-#define BD70528_REG_RTC_WAKE_HOUR	0x45
-#define BD70528_REG_RTC_WAKE_CTRL	0x46
-
-#define BD70528_REG_ELAPSED_TIMER_EN	0x42
-#define BD70528_REG_WAKE_EN		0x46
-
-/* WDT registers */
-#define BD70528_REG_WDT_CTRL		0x4A
-#define BD70528_REG_WDT_HOUR		0x49
-#define BD70528_REG_WDT_MINUTE		0x48
-#define BD70528_REG_WDT_SEC		0x47
-
-/* Charger / Battery */
-#define BD70528_REG_CHG_CURR_STAT	0x59
-#define BD70528_REG_CHG_BAT_STAT	0x57
-#define BD70528_REG_CHG_BAT_TEMP	0x58
-#define BD70528_REG_CHG_IN_STAT		0x56
-#define BD70528_REG_CHG_DCIN_ILIM	0x5d
-#define BD70528_REG_CHG_CHG_CURR_WARM	0x61
-#define BD70528_REG_CHG_CHG_CURR_COLD	0x62
-
-/* Masks for main IRQ register bits */
-enum {
-	BD70528_INT_SHDN,
-#define BD70528_INT_SHDN_MASK BIT(BD70528_INT_SHDN)
-	BD70528_INT_PWR_FLT,
-#define BD70528_INT_PWR_FLT_MASK BIT(BD70528_INT_PWR_FLT)
-	BD70528_INT_VR_FLT,
-#define BD70528_INT_VR_FLT_MASK BIT(BD70528_INT_VR_FLT)
-	BD70528_INT_MISC,
-#define BD70528_INT_MISC_MASK BIT(BD70528_INT_MISC)
-	BD70528_INT_BAT1,
-#define BD70528_INT_BAT1_MASK BIT(BD70528_INT_BAT1)
-	BD70528_INT_RTC,
-#define BD70528_INT_RTC_MASK BIT(BD70528_INT_RTC)
-	BD70528_INT_GPIO,
-#define BD70528_INT_GPIO_MASK BIT(BD70528_INT_GPIO)
-	BD70528_INT_OP_FAIL,
-#define BD70528_INT_OP_FAIL_MASK BIT(BD70528_INT_OP_FAIL)
-};
-
-/* IRQs */
-enum {
-	/* Shutdown register IRQs */
-	BD70528_INT_LONGPUSH,
-	BD70528_INT_WDT,
-	BD70528_INT_HWRESET,
-	BD70528_INT_RSTB_FAULT,
-	BD70528_INT_VBAT_UVLO,
-	BD70528_INT_TSD,
-	BD70528_INT_RSTIN,
-	/* Power failure register IRQs */
-	BD70528_INT_BUCK1_FAULT,
-	BD70528_INT_BUCK2_FAULT,
-	BD70528_INT_BUCK3_FAULT,
-	BD70528_INT_LDO1_FAULT,
-	BD70528_INT_LDO2_FAULT,
-	BD70528_INT_LDO3_FAULT,
-	BD70528_INT_LED1_FAULT,
-	BD70528_INT_LED2_FAULT,
-	/* VR FAULT register IRQs */
-	BD70528_INT_BUCK1_OCP,
-	BD70528_INT_BUCK2_OCP,
-	BD70528_INT_BUCK3_OCP,
-	BD70528_INT_LED1_OCP,
-	BD70528_INT_LED2_OCP,
-	BD70528_INT_BUCK1_FULLON,
-	BD70528_INT_BUCK2_FULLON,
-	/* PMU register interrupts */
-	BD70528_INT_SHORTPUSH,
-	BD70528_INT_AUTO_WAKEUP,
-	BD70528_INT_STATE_CHANGE,
-	/* Charger 1 register IRQs */
-	BD70528_INT_BAT_OV_RES,
-	BD70528_INT_BAT_OV_DET,
-	BD70528_INT_DBAT_DET,
-	BD70528_INT_BATTSD_COLD_RES,
-	BD70528_INT_BATTSD_COLD_DET,
-	BD70528_INT_BATTSD_HOT_RES,
-	BD70528_INT_BATTSD_HOT_DET,
-	BD70528_INT_CHG_TSD,
-	/* Charger 2 register IRQs */
-	BD70528_INT_BAT_RMV,
-	BD70528_INT_BAT_DET,
-	BD70528_INT_DCIN2_OV_RES,
-	BD70528_INT_DCIN2_OV_DET,
-	BD70528_INT_DCIN2_RMV,
-	BD70528_INT_DCIN2_DET,
-	BD70528_INT_DCIN1_RMV,
-	BD70528_INT_DCIN1_DET,
-	/* RTC register IRQs */
-	BD70528_INT_RTC_ALARM,
-	BD70528_INT_ELPS_TIM,
-	/* GPIO register IRQs */
-	BD70528_INT_GPIO0,
-	BD70528_INT_GPIO1,
-	BD70528_INT_GPIO2,
-	BD70528_INT_GPIO3,
-	/* Invalid operation register IRQs */
-	BD70528_INT_BUCK1_DVS_OPFAIL,
-	BD70528_INT_BUCK2_DVS_OPFAIL,
-	BD70528_INT_BUCK3_DVS_OPFAIL,
-	BD70528_INT_LED1_VOLT_OPFAIL,
-	BD70528_INT_LED2_VOLT_OPFAIL,
-};
-
-/* Masks */
-#define BD70528_INT_LONGPUSH_MASK 0x1
-#define BD70528_INT_WDT_MASK 0x2
-#define BD70528_INT_HWRESET_MASK 0x4
-#define BD70528_INT_RSTB_FAULT_MASK 0x8
-#define BD70528_INT_VBAT_UVLO_MASK 0x10
-#define BD70528_INT_TSD_MASK 0x20
-#define BD70528_INT_RSTIN_MASK 0x40
-
-#define BD70528_INT_BUCK1_FAULT_MASK 0x1
-#define BD70528_INT_BUCK2_FAULT_MASK 0x2
-#define BD70528_INT_BUCK3_FAULT_MASK 0x4
-#define BD70528_INT_LDO1_FAULT_MASK 0x8
-#define BD70528_INT_LDO2_FAULT_MASK 0x10
-#define BD70528_INT_LDO3_FAULT_MASK 0x20
-#define BD70528_INT_LED1_FAULT_MASK 0x40
-#define BD70528_INT_LED2_FAULT_MASK 0x80
-
-#define BD70528_INT_BUCK1_OCP_MASK 0x1
-#define BD70528_INT_BUCK2_OCP_MASK 0x2
-#define BD70528_INT_BUCK3_OCP_MASK 0x4
-#define BD70528_INT_LED1_OCP_MASK 0x8
-#define BD70528_INT_LED2_OCP_MASK 0x10
-#define BD70528_INT_BUCK1_FULLON_MASK 0x20
-#define BD70528_INT_BUCK2_FULLON_MASK 0x40
-
-#define BD70528_INT_SHORTPUSH_MASK 0x1
-#define BD70528_INT_AUTO_WAKEUP_MASK 0x2
-#define BD70528_INT_STATE_CHANGE_MASK 0x10
-
-#define BD70528_INT_BAT_OV_RES_MASK 0x1
-#define BD70528_INT_BAT_OV_DET_MASK 0x2
-#define BD70528_INT_DBAT_DET_MASK 0x4
-#define BD70528_INT_BATTSD_COLD_RES_MASK 0x8
-#define BD70528_INT_BATTSD_COLD_DET_MASK 0x10
-#define BD70528_INT_BATTSD_HOT_RES_MASK 0x20
-#define BD70528_INT_BATTSD_HOT_DET_MASK 0x40
-#define BD70528_INT_CHG_TSD_MASK 0x80
-
-#define BD70528_INT_BAT_RMV_MASK 0x1
-#define BD70528_INT_BAT_DET_MASK 0x2
-#define BD70528_INT_DCIN2_OV_RES_MASK 0x4
-#define BD70528_INT_DCIN2_OV_DET_MASK 0x8
-#define BD70528_INT_DCIN2_RMV_MASK 0x10
-#define BD70528_INT_DCIN2_DET_MASK 0x20
-#define BD70528_INT_DCIN1_RMV_MASK 0x40
-#define BD70528_INT_DCIN1_DET_MASK 0x80
-
-#define BD70528_INT_RTC_ALARM_MASK 0x1
-#define BD70528_INT_ELPS_TIM_MASK 0x2
-
-#define BD70528_INT_GPIO0_MASK 0x1
-#define BD70528_INT_GPIO1_MASK 0x2
-#define BD70528_INT_GPIO2_MASK 0x4
-#define BD70528_INT_GPIO3_MASK 0x8
-
-#define BD70528_INT_BUCK1_DVS_OPFAIL_MASK 0x1
-#define BD70528_INT_BUCK2_DVS_OPFAIL_MASK 0x2
-#define BD70528_INT_BUCK3_DVS_OPFAIL_MASK 0x4
-#define BD70528_INT_LED1_VOLT_OPFAIL_MASK 0x10
-#define BD70528_INT_LED2_VOLT_OPFAIL_MASK 0x20
-
-#define BD70528_DEBOUNCE_MASK 0x3
-
-#define BD70528_DEBOUNCE_DISABLE 0
-#define BD70528_DEBOUNCE_15MS 1
-#define BD70528_DEBOUNCE_30MS 2
-#define BD70528_DEBOUNCE_50MS 3
-
-#define BD70528_GPIO_DRIVE_MASK 0x2
-#define BD70528_GPIO_PUSH_PULL 0x0
-#define BD70528_GPIO_OPEN_DRAIN 0x2
-
-#define BD70528_GPIO_OUT_EN_MASK 0x80
-#define BD70528_GPIO_OUT_ENABLE 0x80
-#define BD70528_GPIO_OUT_DISABLE 0x0
-
-#define BD70528_GPIO_OUT_HI 0x1
-#define BD70528_GPIO_OUT_LO 0x0
-#define BD70528_GPIO_OUT_MASK 0x1
-
-#define BD70528_GPIO_IN_STATE_BASE 1
-
-/* RTC masks to mask out reserved bits */
-
-#define BD70528_MASK_ELAPSED_TIMER_EN	0x1
-/* Mask second, min and hour fields
- * HW would support ALM irq for over 24h
- * (by setting day, month and year too)
- * but as we wish to keep this same as for
- * wake-up we limit ALM to 24H and only
- * unmask sec, min and hour
- */
-#define BD70528_MASK_WAKE_EN		0x1
-
-/* WDT masks */
-#define BD70528_MASK_WDT_EN		0x1
-#define BD70528_MASK_WDT_HOUR		0x1
-#define BD70528_MASK_WDT_MINUTE		0x7f
-#define BD70528_MASK_WDT_SEC		0x7f
-
-#define BD70528_WDT_STATE_BIT		0x1
-#define BD70528_ELAPSED_STATE_BIT	0x2
-#define BD70528_WAKE_STATE_BIT		0x4
-
-/* Charger masks */
-#define BD70528_MASK_CHG_STAT		0x7f
-#define BD70528_MASK_CHG_BAT_TIMER	0x20
-#define BD70528_MASK_CHG_BAT_OVERVOLT	0x10
-#define BD70528_MASK_CHG_BAT_DETECT	0x1
-#define BD70528_MASK_CHG_DCIN1_UVLO	0x1
-#define BD70528_MASK_CHG_DCIN_ILIM	0x3f
-#define BD70528_MASK_CHG_CHG_CURR	0x1f
-#define BD70528_MASK_CHG_TRICKLE_CURR	0x10
-
-/*
- * Note, external battery register is the lonely rider at
- * address 0xc5. See how to stuff that in the regmap
- */
-#define BD70528_MAX_REGISTER 0x94
-
-/* Buck control masks */
-#define BD70528_MASK_RUN_EN	0x4
-#define BD70528_MASK_STBY_EN	0x2
-#define BD70528_MASK_IDLE_EN	0x1
-#define BD70528_MASK_LED1_EN	0x1
-#define BD70528_MASK_LED2_EN	0x10
-
-#define BD70528_MASK_BUCK_VOLT	0xf
-#define BD70528_MASK_LDO_VOLT	0x1f
-#define BD70528_MASK_LED1_VOLT	0x1
-#define BD70528_MASK_LED2_VOLT	0x10
-
-/* Misc irq masks */
-#define BD70528_INT_MASK_SHORT_PUSH	1
-#define BD70528_INT_MASK_AUTO_WAKE	2
-#define BD70528_INT_MASK_POWER_STATE	4
-
-#define BD70528_MASK_BUCK_RAMP 0x10
-#define BD70528_SIFT_BUCK_RAMP 4
-
-#if IS_ENABLED(CONFIG_BD70528_WATCHDOG)
-
-int bd70528_wdt_set(struct rohm_regmap_dev *data, int enable, int *old_state);
-void bd70528_wdt_lock(struct rohm_regmap_dev *data);
-void bd70528_wdt_unlock(struct rohm_regmap_dev *data);
-
-#else /* CONFIG_BD70528_WATCHDOG */
-
-static inline int bd70528_wdt_set(struct rohm_regmap_dev *data, int enable,
-				  int *old_state)
-{
-	return 0;
-}
-
-static inline void bd70528_wdt_lock(struct rohm_regmap_dev *data)
-{
-}
-
-static inline void bd70528_wdt_unlock(struct rohm_regmap_dev *data)
-{
-}
-
-#endif /* CONFIG_BD70528_WATCHDOG */
-
-#endif /* __LINUX_MFD_BD70528_H__ */
diff --git a/include/linux/mfd/rohm-generic.h b/include/linux/mfd/rohm-generic.h
index 35b392a0d73a..8fb763a2265a 100644
--- a/include/linux/mfd/rohm-generic.h
+++ b/include/linux/mfd/rohm-generic.h
@@ -12,7 +12,6 @@ enum rohm_chip_type {
 	ROHM_CHIP_TYPE_BD9573,
 	ROHM_CHIP_TYPE_BD9574,
 	ROHM_CHIP_TYPE_BD9576,
-	ROHM_CHIP_TYPE_BD70528,
 	ROHM_CHIP_TYPE_BD71815,
 	ROHM_CHIP_TYPE_BD71828,
 	ROHM_CHIP_TYPE_BD71837,
-- 
cgit v1.2.3


From a9c8f68ce2c37ced2f7a8667eda71b7753ede398 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 23 Nov 2021 21:27:22 +0200
Subject: spi: pxa2xx: Get rid of unused ->cs_control()

Since the last user of the custom ->cs_control() gone, we may get rid of
this legacy API completely.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20211123192723.44537-2-andriy.shevchenko@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/pxa2xx_spi.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/spi/pxa2xx_spi.h b/include/linux/spi/pxa2xx_spi.h
index eaab121ee575..42e06bfbc2a4 100644
--- a/include/linux/spi/pxa2xx_spi.h
+++ b/include/linux/spi/pxa2xx_spi.h
@@ -9,9 +9,6 @@
 
 #include <linux/pxa2xx_ssp.h>
 
-#define PXA2XX_CS_ASSERT (0x01)
-#define PXA2XX_CS_DEASSERT (0x02)
-
 struct dma_chan;
 
 /*
@@ -47,7 +44,6 @@ struct pxa2xx_spi_chip {
 	u32 timeout;
 	u8 enable_loopback;
 	int gpio_cs;
-	void (*cs_control)(u32 command);
 };
 
 #if defined(CONFIG_ARCH_PXA) || defined(CONFIG_ARCH_MMP)
-- 
cgit v1.2.3


From 8393961c53b31078cfc877bc00eb0f67e1474edd Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 23 Nov 2021 21:27:23 +0200
Subject: spi: pxa2xx: Get rid of unused enable_loopback member

There is no user of the enable_loopback member in the struct pxa2xx_spi_chip.
Remote this legacy member completely.

The mentioned in the documentation the testing phase can be performed with
spidev_test tool.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20211123192723.44537-3-andriy.shevchenko@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/pxa2xx_spi.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/spi/pxa2xx_spi.h b/include/linux/spi/pxa2xx_spi.h
index 42e06bfbc2a4..ca74dce36706 100644
--- a/include/linux/spi/pxa2xx_spi.h
+++ b/include/linux/spi/pxa2xx_spi.h
@@ -42,7 +42,6 @@ struct pxa2xx_spi_chip {
 	u8 rx_threshold;
 	u8 dma_burst_size;
 	u32 timeout;
-	u8 enable_loopback;
 	int gpio_cs;
 };
 
-- 
cgit v1.2.3


From b99658452355d316debee11079e8f1c6c1029355 Mon Sep 17 00:00:00 2001
From: Colin Foster <colin.foster@in-advantage.com>
Date: Sun, 28 Nov 2021 17:57:37 -0800
Subject: net: dsa: ocelot: felix: utilize shared mscc-miim driver for indirect
 MDIO access

Switch to a shared MDIO access implementation by way of the mdio-mscc-miim
driver.

Signed-off-by: Colin Foster <colin.foster@in-advantage.com>
Tested-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mdio/mdio-mscc-miim.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 include/linux/mdio/mdio-mscc-miim.h

(limited to 'include/linux')

diff --git a/include/linux/mdio/mdio-mscc-miim.h b/include/linux/mdio/mdio-mscc-miim.h
new file mode 100644
index 000000000000..5b4ed2c3cbb9
--- /dev/null
+++ b/include/linux/mdio/mdio-mscc-miim.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */
+/*
+ * Driver for the MDIO interface of Microsemi network switches.
+ *
+ * Author: Colin Foster <colin.foster@in-advantage.com>
+ * Copyright (C) 2021 Innovative Advantage
+ */
+#ifndef MDIO_MSCC_MIIM_H
+#define MDIO_MSCC_MIIM_H
+
+#include <linux/device.h>
+#include <linux/phy.h>
+#include <linux/regmap.h>
+
+int mscc_miim_setup(struct device *device, struct mii_bus **bus,
+		    const char *name, struct regmap *mii_regmap,
+		    int status_offset);
+
+#endif
-- 
cgit v1.2.3


From 17247821ae9b40ea6df8d771cfca97d91675be93 Mon Sep 17 00:00:00 2001
From: Dario Binacchi <dariobin@libero.it>
Date: Thu, 25 Nov 2021 23:46:42 +0100
Subject: mfd: ti_am335x_tscadc: Drop the CNTRLREG_TSC_8WIRE macro

In TI's reference manual description for the `AFE_Pen_Ctrl' bit-field
of the TSC's CTRL register, there is no mention of 8-wire touchscreens.
Even commit f0933a60d190 ("mfd: ti_am335x_tscadc: Update logic in CTRL
register for 5-wire TS") says that the value of this bit-field must be
the same for 4-wire and 8-wire touchscreens. So let's remove the
CNTRLREG_TSC_8WIRE macro to avoid misunderstandings.

Signed-off-by: Dario Binacchi <dariobin@libero.it>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20211125224642.21011-5-dariobin@libero.it
---
 include/linux/mfd/ti_am335x_tscadc.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/ti_am335x_tscadc.h b/include/linux/mfd/ti_am335x_tscadc.h
index ba13e043d910..4063b0614d90 100644
--- a/include/linux/mfd/ti_am335x_tscadc.h
+++ b/include/linux/mfd/ti_am335x_tscadc.h
@@ -103,7 +103,6 @@
 #define CNTRLREG_TSC_AFE_CTRL(val) FIELD_PREP(GENMASK(6, 5), (val))
 #define CNTRLREG_TSC_4WIRE	CNTRLREG_TSC_AFE_CTRL(1)
 #define CNTRLREG_TSC_5WIRE	CNTRLREG_TSC_AFE_CTRL(2)
-#define CNTRLREG_TSC_8WIRE	CNTRLREG_TSC_AFE_CTRL(3)
 #define CNTRLREG_TSC_ENB	BIT(7)
 
 /*Control registers bitfields  for MAGADC IP */
-- 
cgit v1.2.3


From 79478bf9ea9fa48d30836afa796ac13d8a0f320b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 17 Nov 2021 07:13:54 +0100
Subject: block: move blk_rq_err_bytes to scsi

blk_rq_err_bytes is only used by the scsi midlayer, so move it there.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20211117061404.331732-2-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 2949d9ac7484..a78d9a0f2a1b 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -947,7 +947,6 @@ struct req_iterator {
  * blk_rq_pos()			: the current sector
  * blk_rq_bytes()		: bytes left in the entire request
  * blk_rq_cur_bytes()		: bytes left in the current segment
- * blk_rq_err_bytes()		: bytes left till the next error boundary
  * blk_rq_sectors()		: sectors left in the entire request
  * blk_rq_cur_sectors()		: sectors left in the current segment
  * blk_rq_stats_sectors()	: sectors of the entire request used for stats
@@ -971,8 +970,6 @@ static inline int blk_rq_cur_bytes(const struct request *rq)
 	return bio_iovec(rq->bio).bv_len;
 }
 
-unsigned int blk_rq_err_bytes(const struct request *rq);
-
 static inline unsigned int blk_rq_sectors(const struct request *rq)
 {
 	return blk_rq_bytes(rq) >> SECTOR_SHIFT;
-- 
cgit v1.2.3


From 786d4e01c550e8bb7c9f9f23bca0596a2a33483c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 17 Nov 2021 07:13:55 +0100
Subject: block: remove rq_flush_dcache_pages

This function is trivial, and flush_dcache_page is always defined, so
just open code it in the 2.5 callers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20211117061404.331732-3-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index a78d9a0f2a1b..308edc2a4925 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -1132,14 +1132,4 @@ static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
 }
 #endif /* CONFIG_BLK_DEV_ZONED */
 
-#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
-# error	"You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform"
-#endif
-#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
-void rq_flush_dcache_pages(struct request *rq);
-#else
-static inline void rq_flush_dcache_pages(struct request *rq)
-{
-}
-#endif /* ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE */
 #endif /* BLK_MQ_H */
-- 
cgit v1.2.3


From 86416916466514e4ae0b7296d20133b6427c4c1f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 22 Nov 2021 14:06:12 +0100
Subject: block: move GENHD_FL_NATIVE_CAPACITY to disk->state

The flag to indicate an unlocked native capacity is dynamic state,
not a driver capability flag, so move it to disk->state.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20211122130625.1136848-2-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/genhd.h | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 74c410263113..e490a71e5e9d 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -60,12 +60,6 @@ struct partition_meta_info {
  * (``BLOCK_EXT_MAJOR``).
  * This affects the maximum number of partitions.
  *
- * ``GENHD_FL_NATIVE_CAPACITY`` (0x0080): based on information in the
- * partition table, the device's capacity has been extended to its
- * native capacity; i.e. the device has hidden capacity used by one
- * of the partitions (this is a flag used so that native capacity is
- * only ever unlocked once).
- *
  * ``GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE`` (0x0100): event polling is
  * blocked whenever a writer holds an exclusive lock.
  *
@@ -86,7 +80,6 @@ struct partition_meta_info {
 #define GENHD_FL_CD				0x0008
 #define GENHD_FL_SUPPRESS_PARTITION_INFO	0x0020
 #define GENHD_FL_EXT_DEVT			0x0040
-#define GENHD_FL_NATIVE_CAPACITY		0x0080
 #define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE	0x0100
 #define GENHD_FL_NO_PART_SCAN			0x0200
 #define GENHD_FL_HIDDEN				0x0400
@@ -140,6 +133,7 @@ struct gendisk {
 #define GD_NEED_PART_SCAN		0
 #define GD_READ_ONLY			1
 #define GD_DEAD				2
+#define GD_NATIVE_CAPACITY		3
 
 	struct mutex open_mutex;	/* open/close mutex */
 	unsigned open_partitions;	/* number of open partitions */
-- 
cgit v1.2.3


From 1545e0b419ba1d9b9bee4061d4826340afe6b0aa Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 22 Nov 2021 14:06:13 +0100
Subject: block: move GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE to disk->event_flags

GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE is all about the event reporting
mechanism, so move it to the event_flags field.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20211122130625.1136848-3-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/genhd.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index e490a71e5e9d..c1136ff3c91f 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -60,9 +60,6 @@ struct partition_meta_info {
  * (``BLOCK_EXT_MAJOR``).
  * This affects the maximum number of partitions.
  *
- * ``GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE`` (0x0100): event polling is
- * blocked whenever a writer holds an exclusive lock.
- *
  * ``GENHD_FL_NO_PART_SCAN`` (0x0200): partition scanning is disabled.
  * Used for loop devices in their default settings and some MMC
  * devices.
@@ -80,7 +77,6 @@ struct partition_meta_info {
 #define GENHD_FL_CD				0x0008
 #define GENHD_FL_SUPPRESS_PARTITION_INFO	0x0020
 #define GENHD_FL_EXT_DEVT			0x0040
-#define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE	0x0100
 #define GENHD_FL_NO_PART_SCAN			0x0200
 #define GENHD_FL_HIDDEN				0x0400
 
@@ -94,6 +90,8 @@ enum {
 	DISK_EVENT_FLAG_POLL			= 1 << 0,
 	/* Forward events to udev */
 	DISK_EVENT_FLAG_UEVENT			= 1 << 1,
+	/* Block event polling when open for exclusive write */
+	DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE	= 1 << 2,
 };
 
 struct disk_events;
-- 
cgit v1.2.3


From 1a827ce1b9f2c740d2c6a228afd972970c18bc21 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 22 Nov 2021 14:06:14 +0100
Subject: block: remove GENHD_FL_CD

GENHD_FL_CD marks a gendisk as a vaguely CD-ROM like device.
Besides being used internally inside of sunvdc.c an xen-blkfront it
is used by xen-blkback as a hint to claim a device exported to a
guest is a CD-ROM like device.  Just check for disk->cdi instead
which is the right indicator for "real" CD-ROM or DVD drivers.  This
will miss the paravirtualized guest drivers, but those make little
sense to report anyway.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20211122130625.1136848-4-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/genhd.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index c1136ff3c91f..74518c576fbb 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -46,10 +46,6 @@ struct partition_meta_info {
  * Must not be set for devices which are removed entirely when the
  * media is removed.
  *
- * ``GENHD_FL_CD`` (0x0008): the block device is a CD-ROM-style
- * device.
- * Affects responses to the ``CDROM_GET_CAPABILITY`` ioctl.
- *
  * ``GENHD_FL_SUPPRESS_PARTITION_INFO`` (0x0020): don't include
  * partition information in ``/proc/partitions`` or in the output of
  * printk_all_partitions().
@@ -74,7 +70,6 @@ struct partition_meta_info {
 #define GENHD_FL_REMOVABLE			0x0001
 /* 2 is unused (used to be GENHD_FL_DRIVERFS) */
 /* 4 is unused (used to be GENHD_FL_MEDIA_CHANGE_NOTIFY) */
-#define GENHD_FL_CD				0x0008
 #define GENHD_FL_SUPPRESS_PARTITION_INFO	0x0020
 #define GENHD_FL_EXT_DEVT			0x0040
 #define GENHD_FL_NO_PART_SCAN			0x0200
-- 
cgit v1.2.3


From 46e7eac647b34ed4106a8262f8bedbb90801fadd Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 22 Nov 2021 14:06:17 +0100
Subject: block: rename GENHD_FL_NO_PART_SCAN to GENHD_FL_NO_PART

The GENHD_FL_NO_PART_SCAN controls more than just partitions canning,
so rename it to GENHD_FL_NO_PART.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://lore.kernel.org/r/20211122130625.1136848-7-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/genhd.h | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 74518c576fbb..0b9be3df9489 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -56,15 +56,15 @@ struct partition_meta_info {
  * (``BLOCK_EXT_MAJOR``).
  * This affects the maximum number of partitions.
  *
- * ``GENHD_FL_NO_PART_SCAN`` (0x0200): partition scanning is disabled.
- * Used for loop devices in their default settings and some MMC
- * devices.
+ * ``GENHD_FL_NO_PART`` (0x0200): partition support is disabled.
+ * The kernel will not scan for partitions from add_disk, and users
+ * can't add partitions manually.
  *
  * ``GENHD_FL_HIDDEN`` (0x0400): the block device is hidden; it
  * doesn't produce events, doesn't appear in sysfs, and doesn't have
  * an associated ``bdev``.
  * Implies ``GENHD_FL_SUPPRESS_PARTITION_INFO`` and
- * ``GENHD_FL_NO_PART_SCAN``.
+ * ``GENHD_FL_NO_PART``.
  * Used for multipath devices.
  */
 #define GENHD_FL_REMOVABLE			0x0001
@@ -72,7 +72,7 @@ struct partition_meta_info {
 /* 4 is unused (used to be GENHD_FL_MEDIA_CHANGE_NOTIFY) */
 #define GENHD_FL_SUPPRESS_PARTITION_INFO	0x0020
 #define GENHD_FL_EXT_DEVT			0x0040
-#define GENHD_FL_NO_PART_SCAN			0x0200
+#define GENHD_FL_NO_PART			0x0200
 #define GENHD_FL_HIDDEN				0x0400
 
 enum {
@@ -180,8 +180,7 @@ static inline int disk_max_parts(struct gendisk *disk)
 
 static inline bool disk_part_scan_enabled(struct gendisk *disk)
 {
-	return disk_max_parts(disk) > 1 &&
-		!(disk->flags & GENHD_FL_NO_PART_SCAN);
+	return disk_max_parts(disk) > 1 && !(disk->flags & GENHD_FL_NO_PART);
 }
 
 static inline dev_t disk_devt(struct gendisk *disk)
-- 
cgit v1.2.3


From 3b5149ac50970669ee0ddb9629ec77ffd5c0622d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 22 Nov 2021 14:06:21 +0100
Subject: block: remove GENHD_FL_SUPPRESS_PARTITION_INFO

This flag is not set directly anywhere and only inherited from
GENHD_FL_HIDDEN.  Just check for GENHD_FL_HIDDEN instead.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20211122130625.1136848-11-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/genhd.h | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 0b9be3df9489..64a2f33ae9ea 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -46,11 +46,6 @@ struct partition_meta_info {
  * Must not be set for devices which are removed entirely when the
  * media is removed.
  *
- * ``GENHD_FL_SUPPRESS_PARTITION_INFO`` (0x0020): don't include
- * partition information in ``/proc/partitions`` or in the output of
- * printk_all_partitions().
- * Used for the null block device and some MMC devices.
- *
  * ``GENHD_FL_EXT_DEVT`` (0x0040): the driver supports extended
  * dynamic ``dev_t``, i.e. it wants extended device numbers
  * (``BLOCK_EXT_MAJOR``).
@@ -63,14 +58,12 @@ struct partition_meta_info {
  * ``GENHD_FL_HIDDEN`` (0x0400): the block device is hidden; it
  * doesn't produce events, doesn't appear in sysfs, and doesn't have
  * an associated ``bdev``.
- * Implies ``GENHD_FL_SUPPRESS_PARTITION_INFO`` and
- * ``GENHD_FL_NO_PART``.
+ * Implies ``GENHD_FL_NO_PART``.
  * Used for multipath devices.
  */
 #define GENHD_FL_REMOVABLE			0x0001
 /* 2 is unused (used to be GENHD_FL_DRIVERFS) */
 /* 4 is unused (used to be GENHD_FL_MEDIA_CHANGE_NOTIFY) */
-#define GENHD_FL_SUPPRESS_PARTITION_INFO	0x0020
 #define GENHD_FL_EXT_DEVT			0x0040
 #define GENHD_FL_NO_PART			0x0200
 #define GENHD_FL_HIDDEN				0x0400
-- 
cgit v1.2.3


From 1ebe2e5f9d68e94c524aba876f27b945669a7879 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 22 Nov 2021 14:06:22 +0100
Subject: block: remove GENHD_FL_EXT_DEVT

All modern drivers can support extra partitions using the extended
dev_t.  In fact except for the ioctl method drivers never even see
partitions in normal operation.

So remove the GENHD_FL_EXT_DEVT and allow extra partitions for all
block devices that do support partitions, and require those that
do not support partitions to explicit disallow them using
GENHD_FL_NO_PART.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20211122130625.1136848-12-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/genhd.h | 28 +++++-----------------------
 1 file changed, 5 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 64a2f33ae9ea..b8ced80178d6 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -46,11 +46,6 @@ struct partition_meta_info {
  * Must not be set for devices which are removed entirely when the
  * media is removed.
  *
- * ``GENHD_FL_EXT_DEVT`` (0x0040): the driver supports extended
- * dynamic ``dev_t``, i.e. it wants extended device numbers
- * (``BLOCK_EXT_MAJOR``).
- * This affects the maximum number of partitions.
- *
  * ``GENHD_FL_NO_PART`` (0x0200): partition support is disabled.
  * The kernel will not scan for partitions from add_disk, and users
  * can't add partitions manually.
@@ -64,7 +59,6 @@ struct partition_meta_info {
 #define GENHD_FL_REMOVABLE			0x0001
 /* 2 is unused (used to be GENHD_FL_DRIVERFS) */
 /* 4 is unused (used to be GENHD_FL_MEDIA_CHANGE_NOTIFY) */
-#define GENHD_FL_EXT_DEVT			0x0040
 #define GENHD_FL_NO_PART			0x0200
 #define GENHD_FL_HIDDEN				0x0400
 
@@ -94,13 +88,13 @@ struct blk_integrity {
 };
 
 struct gendisk {
-	/* major, first_minor and minors are input parameters only,
-	 * don't use directly.  Use disk_devt() and disk_max_parts().
+	/*
+	 * major/first_minor/minors should not be set by any new driver, the
+	 * block core will take care of allocating them automatically.
 	 */
-	int major;			/* major number of driver */
+	int major;
 	int first_minor;
-	int minors;                     /* maximum number of minors, =1 for
-                                         * disks that can't be partitioned. */
+	int minors;
 
 	char disk_name[DISK_NAME_LEN];	/* name of major driver */
 
@@ -164,18 +158,6 @@ static inline bool disk_live(struct gendisk *disk)
 #define disk_to_cdi(disk)	NULL
 #endif
 
-static inline int disk_max_parts(struct gendisk *disk)
-{
-	if (disk->flags & GENHD_FL_EXT_DEVT)
-		return DISK_MAX_PARTS;
-	return disk->minors;
-}
-
-static inline bool disk_part_scan_enabled(struct gendisk *disk)
-{
-	return disk_max_parts(disk) > 1 && !(disk->flags & GENHD_FL_NO_PART);
-}
-
 static inline dev_t disk_devt(struct gendisk *disk)
 {
 	return MKDEV(disk->major, disk->first_minor);
-- 
cgit v1.2.3


From 430cc5d3ab4d0ba0bd011cfbb0035e46ba92920c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 22 Nov 2021 14:06:24 +0100
Subject: block: cleanup the GENHD_FL_* definitions

Switch to an enum and tidy up the documentation.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20211122130625.1136848-14-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/genhd.h | 32 ++++++++++++++------------------
 1 file changed, 14 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index b8ced80178d6..6906a45bc761 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -39,28 +39,24 @@ struct partition_meta_info {
 /**
  * DOC: genhd capability flags
  *
- * ``GENHD_FL_REMOVABLE`` (0x0001): indicates that the block device
- * gives access to removable media.
- * When set, the device remains present even when media is not
- * inserted.
- * Must not be set for devices which are removed entirely when the
+ * ``GENHD_FL_REMOVABLE``: indicates that the block device gives access to
+ * removable media.  When set, the device remains present even when media is not
+ * inserted.  Shall not be set for devices which are removed entirely when the
  * media is removed.
  *
- * ``GENHD_FL_NO_PART`` (0x0200): partition support is disabled.
- * The kernel will not scan for partitions from add_disk, and users
- * can't add partitions manually.
+ * ``GENHD_FL_HIDDEN``: the block device is hidden; it doesn't produce events,
+ * doesn't appear in sysfs, and can't be opened from userspace or using
+ * blkdev_get*. Used for the underlying components of multipath devices.
+ *
+ * ``GENHD_FL_NO_PART``: partition support is disabled.  The kernel will not
+ * scan for partitions from add_disk, and users can't add partitions manually.
  *
- * ``GENHD_FL_HIDDEN`` (0x0400): the block device is hidden; it
- * doesn't produce events, doesn't appear in sysfs, and doesn't have
- * an associated ``bdev``.
- * Implies ``GENHD_FL_NO_PART``.
- * Used for multipath devices.
  */
-#define GENHD_FL_REMOVABLE			0x0001
-/* 2 is unused (used to be GENHD_FL_DRIVERFS) */
-/* 4 is unused (used to be GENHD_FL_MEDIA_CHANGE_NOTIFY) */
-#define GENHD_FL_NO_PART			0x0200
-#define GENHD_FL_HIDDEN				0x0400
+enum {
+	GENHD_FL_REMOVABLE			= 1 << 0,
+	GENHD_FL_HIDDEN				= 1 << 1,
+	GENHD_FL_NO_PART			= 1 << 2,
+};
 
 enum {
 	DISK_EVENT_MEDIA_CHANGE			= 1 << 0, /* media changed */
-- 
cgit v1.2.3


From 48b5c1fbcd8c5bc6b91a56399a5257b801391dd8 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 13 Nov 2021 14:03:26 -0700
Subject: block: only allocate poll_stats if there's a user of them

This is essentially never used, yet it's about 1/3rd of the total
queue size. Allocate it when needed, and don't embed it in the queue.

Kill the queue flag for this while at it, since we can just check the
assigned pointer now.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index bd4370baccca..74118e67f649 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -267,7 +267,7 @@ struct request_queue {
 	int			poll_nsec;
 
 	struct blk_stat_callback	*poll_cb;
-	struct blk_rq_stat	poll_stat[BLK_MQ_POLL_STATS_BKTS];
+	struct blk_rq_stat	*poll_stat;
 
 	struct timer_list	timeout;
 	struct work_struct	timeout_work;
@@ -397,7 +397,6 @@ struct request_queue {
 #define QUEUE_FLAG_FUA		18	/* device supports FUA writes */
 #define QUEUE_FLAG_DAX		19	/* device supports DAX */
 #define QUEUE_FLAG_STATS	20	/* track IO start and completion times */
-#define QUEUE_FLAG_POLL_STATS	21	/* collecting stats for hybrid polling */
 #define QUEUE_FLAG_REGISTERED	22	/* queue has been registered to a disk */
 #define QUEUE_FLAG_QUIESCED	24	/* queue has been quiesced */
 #define QUEUE_FLAG_PCI_P2PDMA	25	/* device supports PCI p2p requests */
-- 
cgit v1.2.3


From 72cd9df2ef788d88c138d51223a01ca6281f232d Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Tue, 23 Nov 2021 17:37:33 -0800
Subject: blk-crypto: remove blk_crypto_unregister()

This function is trivial and is only used in one place.  Having this
function is misleading because it implies that blk_crypto_register()
needs to be paired with blk_crypto_unregister(), which is not the case.
Just set disk->queue->crypto_profile to NULL directly.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20211124013733.347612-1-ebiggers@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 74118e67f649..0a4416ef4fbf 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1170,8 +1170,6 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned lo
 bool blk_crypto_register(struct blk_crypto_profile *profile,
 			 struct request_queue *q);
 
-void blk_crypto_unregister(struct request_queue *q);
-
 #else /* CONFIG_BLK_INLINE_ENCRYPTION */
 
 static inline bool blk_crypto_register(struct blk_crypto_profile *profile,
@@ -1180,8 +1178,6 @@ static inline bool blk_crypto_register(struct blk_crypto_profile *profile,
 	return true;
 }
 
-static inline void blk_crypto_unregister(struct request_queue *q) { }
-
 #endif /* CONFIG_BLK_INLINE_ENCRYPTION */
 
 enum blk_unique_id {
-- 
cgit v1.2.3


From e8dc17e2893b4107366004810ca2a4acf1fc8563 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 25 Oct 2021 09:06:57 +0200
Subject: blk-mq: Add blk_mq_complete_request_direct()

Add blk_mq_complete_request_direct() which completes the block request
directly instead deferring it to softirq for single queue devices.

This is useful for devices which complete the requests in preemptible
context and raising softirq from means scheduling ksoftirqd.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20211025070658.1565848-2-bigeasy@linutronix.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 308edc2a4925..d952c3442261 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -752,6 +752,17 @@ static inline void blk_mq_set_request_complete(struct request *rq)
 	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
 }
 
+/*
+ * Complete the request directly instead of deferring it to softirq or
+ * completing it another CPU. Useful in preemptible instead of an interrupt.
+ */
+static inline void blk_mq_complete_request_direct(struct request *rq,
+		   void (*complete)(struct request *rq))
+{
+	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
+	complete(rq);
+}
+
 void blk_mq_start_request(struct request *rq);
 void blk_mq_end_request(struct request *rq, blk_status_t error);
 void __blk_mq_end_request(struct request *rq, blk_status_t error);
-- 
cgit v1.2.3


From 88c9a2ce520ba381bb70658c80ec704f4d60f728 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 26 Nov 2021 12:58:05 +0100
Subject: fork: move copy_io to block/blk-ioc.c

Move the copying of the I/O context to the block layer as that is where
we can use the proper low-level interfaces.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20211126115817.2087431-3-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/iocontext.h | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index 0a9dc40b7be8..bcd47d104d8e 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -129,14 +129,6 @@ static inline void get_io_context_active(struct io_context *ioc)
 	atomic_inc(&ioc->active_ref);
 }
 
-static inline void ioc_task_link(struct io_context *ioc)
-{
-	get_io_context_active(ioc);
-
-	WARN_ON_ONCE(atomic_read(&ioc->nr_tasks) <= 0);
-	atomic_inc(&ioc->nr_tasks);
-}
-
 struct task_struct;
 #ifdef CONFIG_BLOCK
 void put_io_context(struct io_context *ioc);
@@ -144,10 +136,21 @@ void put_io_context_active(struct io_context *ioc);
 void exit_io_context(struct task_struct *task);
 struct io_context *get_task_io_context(struct task_struct *task,
 				       gfp_t gfp_flags, int node);
+int __copy_io(unsigned long clone_flags, struct task_struct *tsk);
+static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk)
+{
+	if (!current->io_context)
+		return 0;
+	return __copy_io(clone_flags, tsk);
+}
 #else
 struct io_context;
 static inline void put_io_context(struct io_context *ioc) { }
 static inline void exit_io_context(struct task_struct *task) { }
-#endif
+static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk)
+{
+	return 0;
+}
+#endif /* CONFIG_BLOCK */
 
-#endif
+#endif /* IOCONTEXT_H */
-- 
cgit v1.2.3


From 3304742562d27fb87a6d8291cc48824dd20f6964 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 26 Nov 2021 12:58:09 +0100
Subject: block: mark put_io_context_active static

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20211126115817.2087431-7-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/iocontext.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index bcd47d104d8e..3ba45953d522 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -132,7 +132,6 @@ static inline void get_io_context_active(struct io_context *ioc)
 struct task_struct;
 #ifdef CONFIG_BLOCK
 void put_io_context(struct io_context *ioc);
-void put_io_context_active(struct io_context *ioc);
 void exit_io_context(struct task_struct *task);
 struct io_context *get_task_io_context(struct task_struct *task,
 				       gfp_t gfp_flags, int node);
-- 
cgit v1.2.3


From 50569c24be61eafb3efa06e2a3ccd447f75ae1b0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 26 Nov 2021 12:58:12 +0100
Subject: block: remove get_io_context_active

Fold it into it's only caller, and remove a lof of the debug checks
that are not needed.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20211126115817.2087431-10-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/iocontext.h | 16 ----------------
 1 file changed, 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index 3ba45953d522..c1229fbd6691 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -113,22 +113,6 @@ struct io_context {
 	struct work_struct release_work;
 };
 
-/**
- * get_io_context_active - get active reference on ioc
- * @ioc: ioc of interest
- *
- * Only iocs with active reference can issue new IOs.  This function
- * acquires an active reference on @ioc.  The caller must already have an
- * active reference on @ioc.
- */
-static inline void get_io_context_active(struct io_context *ioc)
-{
-	WARN_ON_ONCE(atomic_long_read(&ioc->refcount) <= 0);
-	WARN_ON_ONCE(atomic_read(&ioc->active_ref) <= 0);
-	atomic_long_inc(&ioc->refcount);
-	atomic_inc(&ioc->active_ref);
-}
-
 struct task_struct;
 #ifdef CONFIG_BLOCK
 void put_io_context(struct io_context *ioc);
-- 
cgit v1.2.3


From f3fa33acca9f0058157214800f68b10d8e71ab7a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 26 Nov 2021 13:18:00 +0100
Subject: block: remove the ->rq_disk field in struct request

Just use the disk attached to the request_queue instead.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Link: https://lore.kernel.org/r/20211126121802.2090656-4-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index d952c3442261..ede7bef8880a 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -100,7 +100,6 @@ struct request {
 		struct request *rq_next;
 	};
 
-	struct gendisk *rq_disk;
 	struct block_device *part;
 #ifdef CONFIG_BLK_RQ_ALLOC_TIME
 	/* Time that the first bio started allocating this request. */
@@ -890,9 +889,6 @@ static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio,
 	rq->__data_len = bio->bi_iter.bi_size;
 	rq->bio = rq->biotail = bio;
 	rq->ioprio = bio_prio(bio);
-
-	if (bio->bi_bdev)
-		rq->rq_disk = bio->bi_bdev->bd_disk;
 }
 
 void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
-- 
cgit v1.2.3


From b84ba30b6c7a75babdf73b83bc3c7b59b944501a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 26 Nov 2021 13:18:01 +0100
Subject: block: remove the gendisk argument to blk_execute_rq

Remove the gendisk aregument to blk_execute_rq and blk_execute_rq_nowait
given that it is unused now.  Also convert the boolean at_head parameter
to actually use the bool type while touching the prototype.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Link: https://lore.kernel.org/r/20211126121802.2090656-5-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index ede7bef8880a..1b87b7c8bbff 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -924,10 +924,9 @@ int blk_rq_unmap_user(struct bio *);
 int blk_rq_map_kern(struct request_queue *, struct request *, void *,
 		unsigned int, gfp_t);
 int blk_rq_append_bio(struct request *rq, struct bio *bio);
-void blk_execute_rq_nowait(struct gendisk *, struct request *, int,
-		rq_end_io_fn *);
-blk_status_t blk_execute_rq(struct gendisk *bd_disk, struct request *rq,
-		int at_head);
+void blk_execute_rq_nowait(struct request *rq, bool at_head,
+		rq_end_io_fn *end_io);
+blk_status_t blk_execute_rq(struct request *rq, bool at_head);
 
 struct req_iterator {
 	struct bvec_iter iter;
-- 
cgit v1.2.3


From f7e5b9bfa6c8820407b64eabc1f29c9a87e8993d Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 29 Nov 2021 10:39:29 -0500
Subject: siphash: use _unaligned version by default

On ARM v6 and later, we define CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
because the ordinary load/store instructions (ldr, ldrh, ldrb) can
tolerate any misalignment of the memory address. However, load/store
double and load/store multiple instructions (ldrd, ldm) may still only
be used on memory addresses that are 32-bit aligned, and so we have to
use the CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS macro with care, or we
may end up with a severe performance hit due to alignment traps that
require fixups by the kernel. Testing shows that this currently happens
with clang-13 but not gcc-11. In theory, any compiler version can
produce this bug or other problems, as we are dealing with undefined
behavior in C99 even on architectures that support this in hardware,
see also https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100363.

Fortunately, the get_unaligned() accessors do the right thing: when
building for ARMv6 or later, the compiler will emit unaligned accesses
using the ordinary load/store instructions (but avoid the ones that
require 32-bit alignment). When building for older ARM, those accessors
will emit the appropriate sequence of ldrb/mov/orr instructions. And on
architectures that can truly tolerate any kind of misalignment, the
get_unaligned() accessors resolve to the leXX_to_cpup accessors that
operate on aligned addresses.

Since the compiler will in fact emit ldrd or ldm instructions when
building this code for ARM v6 or later, the solution is to use the
unaligned accessors unconditionally on architectures where this is
known to be fast. The _aligned version of the hash function is
however still needed to get the best performance on architectures
that cannot do any unaligned access in hardware.

This new version avoids the undefined behavior and should produce
the fastest hash on all architectures we support.

Link: https://lore.kernel.org/linux-arm-kernel/20181008211554.5355-4-ard.biesheuvel@linaro.org/
Link: https://lore.kernel.org/linux-crypto/CAK8P3a2KfmmGDbVHULWevB0hv71P2oi2ZCHEAqT=8dQfa0=cqQ@mail.gmail.com/
Reported-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Fixes: 2c956a60778c ("siphash: add cryptographically secure PRF")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Jason A. Donenfeld <Jason@zx2c4.com>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/siphash.h | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/siphash.h b/include/linux/siphash.h
index bf21591a9e5e..0cda61855d90 100644
--- a/include/linux/siphash.h
+++ b/include/linux/siphash.h
@@ -27,9 +27,7 @@ static inline bool siphash_key_is_zero(const siphash_key_t *key)
 }
 
 u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key);
-#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
 u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key);
-#endif
 
 u64 siphash_1u64(const u64 a, const siphash_key_t *key);
 u64 siphash_2u64(const u64 a, const u64 b, const siphash_key_t *key);
@@ -82,10 +80,9 @@ static inline u64 ___siphash_aligned(const __le64 *data, size_t len,
 static inline u64 siphash(const void *data, size_t len,
 			  const siphash_key_t *key)
 {
-#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
-	if (!IS_ALIGNED((unsigned long)data, SIPHASH_ALIGNMENT))
+	if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
+	    !IS_ALIGNED((unsigned long)data, SIPHASH_ALIGNMENT))
 		return __siphash_unaligned(data, len, key);
-#endif
 	return ___siphash_aligned(data, len, key);
 }
 
@@ -96,10 +93,8 @@ typedef struct {
 
 u32 __hsiphash_aligned(const void *data, size_t len,
 		       const hsiphash_key_t *key);
-#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
 u32 __hsiphash_unaligned(const void *data, size_t len,
 			 const hsiphash_key_t *key);
-#endif
 
 u32 hsiphash_1u32(const u32 a, const hsiphash_key_t *key);
 u32 hsiphash_2u32(const u32 a, const u32 b, const hsiphash_key_t *key);
@@ -135,10 +130,9 @@ static inline u32 ___hsiphash_aligned(const __le32 *data, size_t len,
 static inline u32 hsiphash(const void *data, size_t len,
 			   const hsiphash_key_t *key)
 {
-#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
-	if (!IS_ALIGNED((unsigned long)data, HSIPHASH_ALIGNMENT))
+	if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
+	    !IS_ALIGNED((unsigned long)data, HSIPHASH_ALIGNMENT))
 		return __hsiphash_unaligned(data, len, key);
-#endif
 	return ___hsiphash_aligned(data, len, key);
 }
 
-- 
cgit v1.2.3


From 4047b9db1aa7512a10ba3560a3f63821c8c40235 Mon Sep 17 00:00:00 2001
From: Bhupesh Sharma <bhupesh.sharma@linaro.org>
Date: Mon, 29 Nov 2021 01:28:54 +0530
Subject: net: stmmac: Add platform level debug register dump feature

dwmac-qcom-ethqos currently exposes a mechanism to dump rgmii registers
after the 'stmmac_dvr_probe()' returns. However with commit
5ec55823438e ("net: stmmac: add clocks management for gmac driver"),
we now let 'pm_runtime_put()' disable the clocks before returning from
'stmmac_dvr_probe()'.

This causes a crash when 'rgmii_dump()' register dumps are enabled,
as the clocks are already off.

Since other dwmac drivers (possible future users as well) might
require a similar register dump feature, introduce a platform level
callback to allow the same.

This fixes the crash noticed while enabling rgmii_dump() dumps in
dwmac-qcom-ethqos driver as well. It also allows future changes
to keep a invoking the register dump callback from the correct
place inside 'stmmac_dvr_probe()'.

Fixes: 5ec55823438e ("net: stmmac: add clocks management for gmac driver")
Cc: Joakim Zhang <qiangqing.zhang@nxp.com>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Bhupesh Sharma <bhupesh.sharma@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/stmmac.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 89b8e208cd7b..24eea1b05ca2 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -233,6 +233,7 @@ struct plat_stmmacenet_data {
 	int (*clks_config)(void *priv, bool enabled);
 	int (*crosststamp)(ktime_t *device, struct system_counterval_t *system,
 			   void *ctx);
+	void (*dump_debug_regs)(void *priv);
 	void *bsp_priv;
 	struct clk *stmmac_clk;
 	struct clk *pclk;
-- 
cgit v1.2.3


From 79364031c5b4365ca28ac0fa00acfab5bf465be1 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Sat, 27 Nov 2021 17:32:00 +0100
Subject: bpf: Make sure bpf_disable_instrumentation() is safe vs preemption.

The initial implementation of migrate_disable() for mainline was a
wrapper around preempt_disable(). RT kernels substituted this with a
real migrate disable implementation.

Later on mainline gained true migrate disable support, but neither
documentation nor affected code were updated.

Remove stale comments claiming that migrate_disable() is PREEMPT_RT only.

Don't use __this_cpu_inc() in the !PREEMPT_RT path because preemption is
not disabled and the RMW operation can be preempted.

Fixes: 74d862b682f51 ("sched: Make migrate_disable/enable() independent of RT")
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211127163200.10466-3-bigeasy@linutronix.de
---
 include/linux/bpf.h    | 16 ++--------------
 include/linux/filter.h |  3 ---
 2 files changed, 2 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 84ff6ef49462..755f38e893be 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1353,28 +1353,16 @@ extern struct mutex bpf_stats_enabled_mutex;
  * kprobes, tracepoints) to prevent deadlocks on map operations as any of
  * these events can happen inside a region which holds a map bucket lock
  * and can deadlock on it.
- *
- * Use the preemption safe inc/dec variants on RT because migrate disable
- * is preemptible on RT and preemption in the middle of the RMW operation
- * might lead to inconsistent state. Use the raw variants for non RT
- * kernels as migrate_disable() maps to preempt_disable() so the slightly
- * more expensive save operation can be avoided.
  */
 static inline void bpf_disable_instrumentation(void)
 {
 	migrate_disable();
-	if (IS_ENABLED(CONFIG_PREEMPT_RT))
-		this_cpu_inc(bpf_prog_active);
-	else
-		__this_cpu_inc(bpf_prog_active);
+	this_cpu_inc(bpf_prog_active);
 }
 
 static inline void bpf_enable_instrumentation(void)
 {
-	if (IS_ENABLED(CONFIG_PREEMPT_RT))
-		this_cpu_dec(bpf_prog_active);
-	else
-		__this_cpu_dec(bpf_prog_active);
+	this_cpu_dec(bpf_prog_active);
 	migrate_enable();
 }
 
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 24b7ed2677af..534f678ca50f 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -640,9 +640,6 @@ static __always_inline u32 bpf_prog_run(const struct bpf_prog *prog, const void
  * This uses migrate_disable/enable() explicitly to document that the
  * invocation of a BPF program does not require reentrancy protection
  * against a BPF program which is invoked from a preempting task.
- *
- * For non RT enabled kernels migrate_disable/enable() maps to
- * preempt_disable/enable(), i.e. it disables also preemption.
  */
 static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog,
 					  const void *ctx)
-- 
cgit v1.2.3


From e6f2dd0f80674e9d5960337b3e9c2a242441b326 Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannekoong@fb.com>
Date: Mon, 29 Nov 2021 19:06:19 -0800
Subject: bpf: Add bpf_loop helper

This patch adds the kernel-side and API changes for a new helper
function, bpf_loop:

long bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx,
u64 flags);

where long (*callback_fn)(u32 index, void *ctx);

bpf_loop invokes the "callback_fn" **nr_loops** times or until the
callback_fn returns 1. The callback_fn can only return 0 or 1, and
this is enforced by the verifier. The callback_fn index is zero-indexed.

A few things to please note:
~ The "u64 flags" parameter is currently unused but is included in
case a future use case for it arises.
~ In the kernel-side implementation of bpf_loop (kernel/bpf/bpf_iter.c),
bpf_callback_t is used as the callback function cast.
~ A program can have nested bpf_loop calls but the program must
still adhere to the verifier constraint of its stack depth (the stack depth
cannot exceed MAX_BPF_STACK))
~ Recursive callback_fns do not pass the verifier, due to the call stack
for these being too deep.
~ The next patch will include the tests and benchmark

Signed-off-by: Joanne Koong <joannekoong@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211130030622.4131246-2-joannekoong@fb.com
---
 include/linux/bpf.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index cc7a0c36e7df..cad0829710be 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2164,6 +2164,7 @@ extern const struct bpf_func_proto bpf_sk_setsockopt_proto;
 extern const struct bpf_func_proto bpf_sk_getsockopt_proto;
 extern const struct bpf_func_proto bpf_kallsyms_lookup_name_proto;
 extern const struct bpf_func_proto bpf_find_vma_proto;
+extern const struct bpf_func_proto bpf_loop_proto;
 
 const struct bpf_func_proto *tracing_prog_func_proto(
   enum bpf_func_id func_id, const struct bpf_prog *prog);
-- 
cgit v1.2.3


From 7ad639840acf2800b5f387c495795f995a67a329 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 29 Nov 2021 13:06:43 +0000
Subject: thread_info: Add helpers to snapshot thread flags

In <linux/thread_info.h> there are helpers to manipulate individual thread
flags, but where code wants to check several flags at once, it must open
code reading current_thread_info()->flags and operating on a snapshot.

As some flags can be set remotely it's necessary to use READ_ONCE() to get
a consistent snapshot even when IRQs are disabled, but some code forgets to
do this. Generally this is unlike to cause a problem in practice, but it is
somewhat unsound, and KCSAN will legitimately warn that there is a data
race.

To make it easier to do the right thing, and to highlight that concurrent
modification is possible, add new helpers to snapshot the flags, which
should be used in preference to plain reads. Subsequent patches will move
existing code to use the new helpers.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Marco Elver <elver@google.com>
Acked-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20211129130653.2037928-2-mark.rutland@arm.com
---
 include/linux/thread_info.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index ad0c4e041030..73a6f34b3847 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -118,6 +118,15 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
 	return test_bit(flag, (unsigned long *)&ti->flags);
 }
 
+/*
+ * This may be used in noinstr code, and needs to be __always_inline to prevent
+ * inadvertent instrumentation.
+ */
+static __always_inline unsigned long read_ti_thread_flags(struct thread_info *ti)
+{
+	return READ_ONCE(ti->flags);
+}
+
 #define set_thread_flag(flag) \
 	set_ti_thread_flag(current_thread_info(), flag)
 #define clear_thread_flag(flag) \
@@ -130,6 +139,11 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
 	test_and_clear_ti_thread_flag(current_thread_info(), flag)
 #define test_thread_flag(flag) \
 	test_ti_thread_flag(current_thread_info(), flag)
+#define read_thread_flags() \
+	read_ti_thread_flags(current_thread_info())
+
+#define read_task_thread_flags(t) \
+	read_ti_thread_flags(task_thread_info(t))
 
 #ifdef CONFIG_GENERIC_ENTRY
 #define set_syscall_work(fl) \
-- 
cgit v1.2.3


From 6ce895128b3bff738fe8d9dd74747a03e319e466 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 29 Nov 2021 13:06:44 +0000
Subject: entry: Snapshot thread flags

Some thread flags can be set remotely, and so even when IRQs are disabled,
the flags can change under our feet. Generally this is unlikely to cause a
problem in practice, but it is somewhat unsound, and KCSAN will
legitimately warn that there is a data race.

To avoid such issues, a snapshot of the flags has to be taken prior to
using them. Some places already use READ_ONCE() for that, others do not.

Convert them all to the new flag accessor helpers.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Paul E. McKenney <paulmck@kernel.org>
Link: https://lore.kernel.org/r/20211129130653.2037928-3-mark.rutland@arm.com
---
 include/linux/entry-kvm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h
index 0d7865a0731c..07c878d6e323 100644
--- a/include/linux/entry-kvm.h
+++ b/include/linux/entry-kvm.h
@@ -75,7 +75,7 @@ static inline void xfer_to_guest_mode_prepare(void)
  */
 static inline bool __xfer_to_guest_mode_work_pending(void)
 {
-	unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
+	unsigned long ti_work = read_thread_flags();
 
 	return !!(ti_work & XFER_TO_GUEST_MODE_WORK);
 }
-- 
cgit v1.2.3


From 4946f15e8c334840bf277a0bf924371eae120fcd Mon Sep 17 00:00:00 2001
From: Rikard Falkeborn <rikard.falkeborn@gmail.com>
Date: Tue, 30 Nov 2021 22:40:43 +0100
Subject: genirq/generic_chip: Constify irq_generic_chip_ops

The only usage of irq_generic_chip_ops is to pass its address to
irq_domain_add_linear() which takes a pointer to const struct
irq_domain_ops. Make it const to allow the compiler to put it in
read-only memory.

[ tglx: Fixed subject prefix ]

Signed-off-by: Rikard Falkeborn <rikard.falkeborn@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20211130214043.1257585-1-rikard.falkeborn@gmail.com
---
 include/linux/irqdomain.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 553da4899f55..d476405802e9 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -131,7 +131,7 @@ struct irq_domain_ops {
 #endif
 };
 
-extern struct irq_domain_ops irq_generic_chip_ops;
+extern const struct irq_domain_ops irq_generic_chip_ops;
 
 struct irq_domain_chip_generic;
 
-- 
cgit v1.2.3


From 24ba53017e188e031f9cb8b290286fad52d2af00 Mon Sep 17 00:00:00 2001
From: Chun-Hung Tseng <henrybear327@gmail.com>
Date: Wed, 15 Sep 2021 17:02:18 +0800
Subject: rcu: Replace ________p1 and _________p1 with __UNIQUE_ID(rcu)

This commit replaces both ________p1 and _________p1 with __UNIQUE_ID(rcu),
and also adjusts the callers of the affected macros.

__UNIQUE_ID(rcu) will generate unique variable names during compilation,
which eliminates the need of ________p1 and _________p1 (both having 4
occurrences prior to the code change).  This also avoids the variable
name shadowing issue, or at least makes those wishing to cause shadowing
problems work much harder to do so.

The same idea is used for the min/max macros (commit 589a978 and commit
e9092d0).

Signed-off-by: Jim Huang <jserv@ccns.ncku.edu.tw>
Signed-off-by: Chun-Hung Tseng <henrybear327@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcupdate.h | 50 +++++++++++++++++++++++++++---------------------
 include/linux/srcu.h     |  3 ++-
 2 files changed, 30 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 5e0beb5c5659..88b42eb46406 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -364,6 +364,12 @@ static inline void rcu_preempt_sleep_check(void) { }
 #define rcu_check_sparse(p, space)
 #endif /* #else #ifdef __CHECKER__ */
 
+#define __unrcu_pointer(p, local)					\
+({									\
+	typeof(*p) *local = (typeof(*p) *__force)(p);			\
+	rcu_check_sparse(p, __rcu);					\
+	((typeof(*p) __force __kernel *)(local)); 			\
+})
 /**
  * unrcu_pointer - mark a pointer as not being RCU protected
  * @p: pointer needing to lose its __rcu property
@@ -371,39 +377,35 @@ static inline void rcu_preempt_sleep_check(void) { }
  * Converts @p from an __rcu pointer to a __kernel pointer.
  * This allows an __rcu pointer to be used with xchg() and friends.
  */
-#define unrcu_pointer(p)						\
-({									\
-	typeof(*p) *_________p1 = (typeof(*p) *__force)(p);		\
-	rcu_check_sparse(p, __rcu);					\
-	((typeof(*p) __force __kernel *)(_________p1)); 		\
-})
+#define unrcu_pointer(p) __unrcu_pointer(p, __UNIQUE_ID(rcu))
 
-#define __rcu_access_pointer(p, space) \
+#define __rcu_access_pointer(p, local, space) \
 ({ \
-	typeof(*p) *_________p1 = (typeof(*p) *__force)READ_ONCE(p); \
+	typeof(*p) *local = (typeof(*p) *__force)READ_ONCE(p); \
 	rcu_check_sparse(p, space); \
-	((typeof(*p) __force __kernel *)(_________p1)); \
+	((typeof(*p) __force __kernel *)(local)); \
 })
-#define __rcu_dereference_check(p, c, space) \
+#define __rcu_dereference_check(p, local, c, space) \
 ({ \
 	/* Dependency order vs. p above. */ \
-	typeof(*p) *________p1 = (typeof(*p) *__force)READ_ONCE(p); \
+	typeof(*p) *local = (typeof(*p) *__force)READ_ONCE(p); \
 	RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_check() usage"); \
 	rcu_check_sparse(p, space); \
-	((typeof(*p) __force __kernel *)(________p1)); \
+	((typeof(*p) __force __kernel *)(local)); \
 })
-#define __rcu_dereference_protected(p, c, space) \
+#define __rcu_dereference_protected(p, local, c, space) \
 ({ \
 	RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_protected() usage"); \
 	rcu_check_sparse(p, space); \
 	((typeof(*p) __force __kernel *)(p)); \
 })
-#define rcu_dereference_raw(p) \
+#define __rcu_dereference_raw(p, local) \
 ({ \
 	/* Dependency order vs. p above. */ \
-	typeof(p) ________p1 = READ_ONCE(p); \
-	((typeof(*p) __force __kernel *)(________p1)); \
+	typeof(p) local = READ_ONCE(p); \
+	((typeof(*p) __force __kernel *)(local)); \
 })
+#define rcu_dereference_raw(p) __rcu_dereference_raw(p, __UNIQUE_ID(rcu))
 
 /**
  * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
@@ -490,7 +492,7 @@ do {									      \
  * when tearing down multi-linked structures after a grace period
  * has elapsed.
  */
-#define rcu_access_pointer(p) __rcu_access_pointer((p), __rcu)
+#define rcu_access_pointer(p) __rcu_access_pointer((p), __UNIQUE_ID(rcu), __rcu)
 
 /**
  * rcu_dereference_check() - rcu_dereference with debug checking
@@ -526,7 +528,8 @@ do {									      \
  * annotated as __rcu.
  */
 #define rcu_dereference_check(p, c) \
-	__rcu_dereference_check((p), (c) || rcu_read_lock_held(), __rcu)
+	__rcu_dereference_check((p), __UNIQUE_ID(rcu), \
+				(c) || rcu_read_lock_held(), __rcu)
 
 /**
  * rcu_dereference_bh_check() - rcu_dereference_bh with debug checking
@@ -541,7 +544,8 @@ do {									      \
  * rcu_read_lock() but also rcu_read_lock_bh() into account.
  */
 #define rcu_dereference_bh_check(p, c) \
-	__rcu_dereference_check((p), (c) || rcu_read_lock_bh_held(), __rcu)
+	__rcu_dereference_check((p), __UNIQUE_ID(rcu), \
+				(c) || rcu_read_lock_bh_held(), __rcu)
 
 /**
  * rcu_dereference_sched_check() - rcu_dereference_sched with debug checking
@@ -556,7 +560,8 @@ do {									      \
  * only rcu_read_lock() but also rcu_read_lock_sched() into account.
  */
 #define rcu_dereference_sched_check(p, c) \
-	__rcu_dereference_check((p), (c) || rcu_read_lock_sched_held(), \
+	__rcu_dereference_check((p), __UNIQUE_ID(rcu), \
+				(c) || rcu_read_lock_sched_held(), \
 				__rcu)
 
 /*
@@ -566,7 +571,8 @@ do {									      \
  * The no-tracing version of rcu_dereference_raw() must not call
  * rcu_read_lock_held().
  */
-#define rcu_dereference_raw_check(p) __rcu_dereference_check((p), 1, __rcu)
+#define rcu_dereference_raw_check(p) \
+	__rcu_dereference_check((p), __UNIQUE_ID(rcu), 1, __rcu)
 
 /**
  * rcu_dereference_protected() - fetch RCU pointer when updates prevented
@@ -585,7 +591,7 @@ do {									      \
  * but very ugly failures.
  */
 #define rcu_dereference_protected(p, c) \
-	__rcu_dereference_protected((p), (c), __rcu)
+	__rcu_dereference_protected((p), __UNIQUE_ID(rcu), (c), __rcu)
 
 
 /**
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index e6011a9975af..01226e4d960a 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -117,7 +117,8 @@ static inline int srcu_read_lock_held(const struct srcu_struct *ssp)
  * lockdep_is_held() calls.
  */
 #define srcu_dereference_check(p, ssp, c) \
-	__rcu_dereference_check((p), (c) || srcu_read_lock_held(ssp), __rcu)
+	__rcu_dereference_check((p), __UNIQUE_ID(rcu), \
+				(c) || srcu_read_lock_held(ssp), __rcu)
 
 /**
  * srcu_dereference - fetch SRCU-protected pointer for later dereferencing
-- 
cgit v1.2.3


From 2407a64f8045552203ee5cb9904ce75ce2fceef4 Mon Sep 17 00:00:00 2001
From: Changbin Du <changbin.du@gmail.com>
Date: Tue, 28 Sep 2021 08:21:28 +0800
Subject: rcu: in_irq() cleanup

This commit replaces the obsolete and ambiguous macro in_irq() with its
shiny new in_hardirq() equivalent.

Signed-off-by: Changbin Du <changbin.du@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcutiny.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 9be015305f9f..858f4d429946 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -85,7 +85,7 @@ static inline void rcu_irq_enter_irqson(void) { }
 static inline void rcu_irq_exit(void) { }
 static inline void rcu_irq_exit_check_preempt(void) { }
 #define rcu_is_idle_cpu(cpu) \
-	(is_idle_task(current) && !in_nmi() && !in_irq() && !in_serving_softirq())
+	(is_idle_task(current) && !in_nmi() && !in_hardirq() && !in_serving_softirq())
 static inline void exit_rcu(void) { }
 static inline bool rcu_preempt_need_deferred_qs(struct task_struct *t)
 {
-- 
cgit v1.2.3


From 502e82b91361955c66c8453b5b7a905b0b5bd5a1 Mon Sep 17 00:00:00 2001
From: Aya Levin <ayal@nvidia.com>
Date: Sun, 7 Nov 2021 17:21:45 +0200
Subject: net/mlx5: Fix access to a non-supported register

Validate MRTC register is supported before triggering a delayed work
which accesses it.

Fixes: 5a1023deeed0 ("net/mlx5: Add periodic update of host time to firmware")
Signed-off-by: Aya Levin <ayal@nvidia.com>
Reviewed-by: Gal Pressman <gal@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 3636df90899a..fbaab440a484 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -9698,7 +9698,10 @@ struct mlx5_ifc_mcam_access_reg_bits {
 	u8         regs_84_to_68[0x11];
 	u8         tracer_registers[0x4];
 
-	u8         regs_63_to_32[0x20];
+	u8         regs_63_to_46[0x12];
+	u8         mrtc[0x1];
+	u8         regs_44_to_32[0xd];
+
 	u8         regs_31_to_0[0x20];
 };
 
-- 
cgit v1.2.3


From af3bf054661fb11497a7f712ece8b838521227a4 Mon Sep 17 00:00:00 2001
From: Wei Yang <richard.weiyang@gmail.com>
Date: Wed, 1 Dec 2021 01:17:36 +0000
Subject: cgroup: fix a typo in comment

In commit 8699b7762a62 ("cgroup: s/child_subsys_mask/subtree_ss_mask/"),
we rename child_subsys_mask to subtree_ss_mask. While it missed to
rename this in comment.

Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup-defs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index db2e147e069f..bb1e79791ed5 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -413,7 +413,7 @@ struct cgroup {
 	/*
 	 * The bitmask of subsystems enabled on the child cgroups.
 	 * ->subtree_control is the one configured through
-	 * "cgroup.subtree_control" while ->child_ss_mask is the effective
+	 * "cgroup.subtree_control" while ->subtree_ss_mask is the effective
 	 * one which may have more subsystems enabled.  Controller knobs
 	 * are made available iff it's enabled in ->subtree_control.
 	 */
-- 
cgit v1.2.3


From 6bbfa44116689469267f1a6e3d233b52114139d2 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Wed, 1 Dec 2021 23:45:50 +0900
Subject: kprobes: Limit max data_size of the kretprobe instances

The 'kprobe::data_size' is unsigned, thus it can not be negative.  But if
user sets it enough big number (e.g. (size_t)-8), the result of 'data_size
+ sizeof(struct kretprobe_instance)' becomes smaller than sizeof(struct
kretprobe_instance) or zero. In result, the kretprobe_instance are
allocated without enough memory, and kretprobe accesses outside of
allocated memory.

To avoid this issue, introduce a max limitation of the
kretprobe::data_size. 4KB per instance should be OK.

Link: https://lkml.kernel.org/r/163836995040.432120.10322772773821182925.stgit@devnote2

Cc: stable@vger.kernel.org
Fixes: f47cd9b553aa ("kprobes: kretprobe user entry-handler")
Reported-by: zhangyue <zhangyue1@kylinos.cn>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/kprobes.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index e974caf39d3e..8c8f7a4d93af 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -153,6 +153,8 @@ struct kretprobe {
 	struct kretprobe_holder *rph;
 };
 
+#define KRETPROBE_MAX_DATA_SIZE	4096
+
 struct kretprobe_instance {
 	union {
 		struct freelist_node freelist;
-- 
cgit v1.2.3


From 7a10d8c810cfad3e79372d7d1c77899d86cd6662 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 30 Nov 2021 09:01:55 -0800
Subject: net: annotate data-races on txq->xmit_lock_owner

syzbot found that __dev_queue_xmit() is reading txq->xmit_lock_owner
without annotations.

No serious issue there, let's document what is happening there.

BUG: KCSAN: data-race in __dev_queue_xmit / __dev_queue_xmit

write to 0xffff888139d09484 of 4 bytes by interrupt on cpu 0:
 __netif_tx_unlock include/linux/netdevice.h:4437 [inline]
 __dev_queue_xmit+0x948/0xf70 net/core/dev.c:4229
 dev_queue_xmit_accel+0x19/0x20 net/core/dev.c:4265
 macvlan_queue_xmit drivers/net/macvlan.c:543 [inline]
 macvlan_start_xmit+0x2b3/0x3d0 drivers/net/macvlan.c:567
 __netdev_start_xmit include/linux/netdevice.h:4987 [inline]
 netdev_start_xmit include/linux/netdevice.h:5001 [inline]
 xmit_one+0x105/0x2f0 net/core/dev.c:3590
 dev_hard_start_xmit+0x72/0x120 net/core/dev.c:3606
 sch_direct_xmit+0x1b2/0x7c0 net/sched/sch_generic.c:342
 __dev_xmit_skb+0x83d/0x1370 net/core/dev.c:3817
 __dev_queue_xmit+0x590/0xf70 net/core/dev.c:4194
 dev_queue_xmit+0x13/0x20 net/core/dev.c:4259
 neigh_hh_output include/net/neighbour.h:511 [inline]
 neigh_output include/net/neighbour.h:525 [inline]
 ip6_finish_output2+0x995/0xbb0 net/ipv6/ip6_output.c:126
 __ip6_finish_output net/ipv6/ip6_output.c:191 [inline]
 ip6_finish_output+0x444/0x4c0 net/ipv6/ip6_output.c:201
 NF_HOOK_COND include/linux/netfilter.h:296 [inline]
 ip6_output+0x10e/0x210 net/ipv6/ip6_output.c:224
 dst_output include/net/dst.h:450 [inline]
 NF_HOOK include/linux/netfilter.h:307 [inline]
 ndisc_send_skb+0x486/0x610 net/ipv6/ndisc.c:508
 ndisc_send_rs+0x3b0/0x3e0 net/ipv6/ndisc.c:702
 addrconf_rs_timer+0x370/0x540 net/ipv6/addrconf.c:3898
 call_timer_fn+0x2e/0x240 kernel/time/timer.c:1421
 expire_timers+0x116/0x240 kernel/time/timer.c:1466
 __run_timers+0x368/0x410 kernel/time/timer.c:1734
 run_timer_softirq+0x2e/0x60 kernel/time/timer.c:1747
 __do_softirq+0x158/0x2de kernel/softirq.c:558
 __irq_exit_rcu kernel/softirq.c:636 [inline]
 irq_exit_rcu+0x37/0x70 kernel/softirq.c:648
 sysvec_apic_timer_interrupt+0x3e/0xb0 arch/x86/kernel/apic/apic.c:1097
 asm_sysvec_apic_timer_interrupt+0x12/0x20

read to 0xffff888139d09484 of 4 bytes by interrupt on cpu 1:
 __dev_queue_xmit+0x5e3/0xf70 net/core/dev.c:4213
 dev_queue_xmit_accel+0x19/0x20 net/core/dev.c:4265
 macvlan_queue_xmit drivers/net/macvlan.c:543 [inline]
 macvlan_start_xmit+0x2b3/0x3d0 drivers/net/macvlan.c:567
 __netdev_start_xmit include/linux/netdevice.h:4987 [inline]
 netdev_start_xmit include/linux/netdevice.h:5001 [inline]
 xmit_one+0x105/0x2f0 net/core/dev.c:3590
 dev_hard_start_xmit+0x72/0x120 net/core/dev.c:3606
 sch_direct_xmit+0x1b2/0x7c0 net/sched/sch_generic.c:342
 __dev_xmit_skb+0x83d/0x1370 net/core/dev.c:3817
 __dev_queue_xmit+0x590/0xf70 net/core/dev.c:4194
 dev_queue_xmit+0x13/0x20 net/core/dev.c:4259
 neigh_resolve_output+0x3db/0x410 net/core/neighbour.c:1523
 neigh_output include/net/neighbour.h:527 [inline]
 ip6_finish_output2+0x9be/0xbb0 net/ipv6/ip6_output.c:126
 __ip6_finish_output net/ipv6/ip6_output.c:191 [inline]
 ip6_finish_output+0x444/0x4c0 net/ipv6/ip6_output.c:201
 NF_HOOK_COND include/linux/netfilter.h:296 [inline]
 ip6_output+0x10e/0x210 net/ipv6/ip6_output.c:224
 dst_output include/net/dst.h:450 [inline]
 NF_HOOK include/linux/netfilter.h:307 [inline]
 ndisc_send_skb+0x486/0x610 net/ipv6/ndisc.c:508
 ndisc_send_rs+0x3b0/0x3e0 net/ipv6/ndisc.c:702
 addrconf_rs_timer+0x370/0x540 net/ipv6/addrconf.c:3898
 call_timer_fn+0x2e/0x240 kernel/time/timer.c:1421
 expire_timers+0x116/0x240 kernel/time/timer.c:1466
 __run_timers+0x368/0x410 kernel/time/timer.c:1734
 run_timer_softirq+0x2e/0x60 kernel/time/timer.c:1747
 __do_softirq+0x158/0x2de kernel/softirq.c:558
 __irq_exit_rcu kernel/softirq.c:636 [inline]
 irq_exit_rcu+0x37/0x70 kernel/softirq.c:648
 sysvec_apic_timer_interrupt+0x8d/0xb0 arch/x86/kernel/apic/apic.c:1097
 asm_sysvec_apic_timer_interrupt+0x12/0x20
 kcsan_setup_watchpoint+0x94/0x420 kernel/kcsan/core.c:443
 folio_test_anon include/linux/page-flags.h:581 [inline]
 PageAnon include/linux/page-flags.h:586 [inline]
 zap_pte_range+0x5ac/0x10e0 mm/memory.c:1347
 zap_pmd_range mm/memory.c:1467 [inline]
 zap_pud_range mm/memory.c:1496 [inline]
 zap_p4d_range mm/memory.c:1517 [inline]
 unmap_page_range+0x2dc/0x3d0 mm/memory.c:1538
 unmap_single_vma+0x157/0x210 mm/memory.c:1583
 unmap_vmas+0xd0/0x180 mm/memory.c:1615
 exit_mmap+0x23d/0x470 mm/mmap.c:3170
 __mmput+0x27/0x1b0 kernel/fork.c:1113
 mmput+0x3d/0x50 kernel/fork.c:1134
 exit_mm+0xdb/0x170 kernel/exit.c:507
 do_exit+0x608/0x17a0 kernel/exit.c:819
 do_group_exit+0xce/0x180 kernel/exit.c:929
 get_signal+0xfc3/0x1550 kernel/signal.c:2852
 arch_do_signal_or_restart+0x8c/0x2e0 arch/x86/kernel/signal.c:868
 handle_signal_work kernel/entry/common.c:148 [inline]
 exit_to_user_mode_loop kernel/entry/common.c:172 [inline]
 exit_to_user_mode_prepare+0x113/0x190 kernel/entry/common.c:207
 __syscall_exit_to_user_mode_work kernel/entry/common.c:289 [inline]
 syscall_exit_to_user_mode+0x20/0x40 kernel/entry/common.c:300
 do_syscall_64+0x50/0xd0 arch/x86/entry/common.c:86
 entry_SYSCALL_64_after_hwframe+0x44/0xae

value changed: 0x00000000 -> 0xffffffff

Reported by Kernel Concurrency Sanitizer on:
CPU: 1 PID: 28712 Comm: syz-executor.0 Tainted: G        W         5.16.0-rc1-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Link: https://lore.kernel.org/r/20211130170155.2331929-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3ec42495a43a..be5cb3360b94 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4404,7 +4404,8 @@ static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits)
 static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu)
 {
 	spin_lock(&txq->_xmit_lock);
-	txq->xmit_lock_owner = cpu;
+	/* Pairs with READ_ONCE() in __dev_queue_xmit() */
+	WRITE_ONCE(txq->xmit_lock_owner, cpu);
 }
 
 static inline bool __netif_tx_acquire(struct netdev_queue *txq)
@@ -4421,26 +4422,32 @@ static inline void __netif_tx_release(struct netdev_queue *txq)
 static inline void __netif_tx_lock_bh(struct netdev_queue *txq)
 {
 	spin_lock_bh(&txq->_xmit_lock);
-	txq->xmit_lock_owner = smp_processor_id();
+	/* Pairs with READ_ONCE() in __dev_queue_xmit() */
+	WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id());
 }
 
 static inline bool __netif_tx_trylock(struct netdev_queue *txq)
 {
 	bool ok = spin_trylock(&txq->_xmit_lock);
-	if (likely(ok))
-		txq->xmit_lock_owner = smp_processor_id();
+
+	if (likely(ok)) {
+		/* Pairs with READ_ONCE() in __dev_queue_xmit() */
+		WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id());
+	}
 	return ok;
 }
 
 static inline void __netif_tx_unlock(struct netdev_queue *txq)
 {
-	txq->xmit_lock_owner = -1;
+	/* Pairs with READ_ONCE() in __dev_queue_xmit() */
+	WRITE_ONCE(txq->xmit_lock_owner, -1);
 	spin_unlock(&txq->_xmit_lock);
 }
 
 static inline void __netif_tx_unlock_bh(struct netdev_queue *txq)
 {
-	txq->xmit_lock_owner = -1;
+	/* Pairs with READ_ONCE() in __dev_queue_xmit() */
+	WRITE_ONCE(txq->xmit_lock_owner, -1);
 	spin_unlock_bh(&txq->_xmit_lock);
 }
 
-- 
cgit v1.2.3


From 57b2b72ac1fc5d55cf3b13207942c109f1a65cb5 Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan@kernel.org>
Date: Mon, 29 Nov 2021 09:57:59 -0700
Subject: mm, slab: Remove compiler check in __kmalloc_index

The minimum supported version of LLVM has been raised to 11.0.0, meaning
this check is always true, so it can be dropped.

Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Reviewed-by: Miguel Ojeda <ojeda@kernel.org>
Reviewed-by: Mark Brown <broonie@kernel.org>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
 include/linux/slab.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 181045148b06..d3fb5ac71c24 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -411,8 +411,7 @@ static __always_inline unsigned int __kmalloc_index(size_t size,
 	if (size <=  16 * 1024 * 1024) return 24;
 	if (size <=  32 * 1024 * 1024) return 25;
 
-	if ((IS_ENABLED(CONFIG_CC_IS_GCC) || CONFIG_CLANG_VERSION >= 110000)
-	    && !IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES) && size_is_constant)
+	if (!IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES) && size_is_constant)
 		BUILD_BUG_ON_MSG(1, "unexpected size in kmalloc_index()");
 	else
 		BUG();
-- 
cgit v1.2.3


From e7f2be115f0746b969c0df14c0d182f65f005ca5 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Tue, 26 Oct 2021 16:10:55 +0200
Subject: sched/cputime: Fix getrusage(RUSAGE_THREAD) with nohz_full

getrusage(RUSAGE_THREAD) with nohz_full may return shorter utime/stime
than the actual time.

task_cputime_adjusted() snapshots utime and stime and then adjust their
sum to match the scheduler maintained cputime.sum_exec_runtime.
Unfortunately in nohz_full, sum_exec_runtime is only updated once per
second in the worst case, causing a discrepancy against utime and stime
that can be updated anytime by the reader using vtime.

To fix this situation, perform an update of cputime.sum_exec_runtime
when the cputime snapshot reports the task as actually running while
the tick is disabled. The related overhead is then contained within the
relevant situations.

Reported-by: Hasegawa Hitomi <hasegawa-hitomi@fujitsu.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Hasegawa Hitomi <hasegawa-hitomi@fujitsu.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Masayoshi Mizuma <m.mizuma@jp.fujitsu.com>
Acked-by: Phil Auld <pauld@redhat.com>
Link: https://lore.kernel.org/r/20211026141055.57358-3-frederic@kernel.org
---
 include/linux/sched/cputime.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/cputime.h b/include/linux/sched/cputime.h
index 6c9f19a33865..ce3c58286062 100644
--- a/include/linux/sched/cputime.h
+++ b/include/linux/sched/cputime.h
@@ -18,15 +18,16 @@
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-extern void task_cputime(struct task_struct *t,
+extern bool task_cputime(struct task_struct *t,
 			 u64 *utime, u64 *stime);
 extern u64 task_gtime(struct task_struct *t);
 #else
-static inline void task_cputime(struct task_struct *t,
+static inline bool task_cputime(struct task_struct *t,
 				u64 *utime, u64 *stime)
 {
 	*utime = t->utime;
 	*stime = t->stime;
+	return false;
 }
 
 static inline u64 task_gtime(struct task_struct *t)
-- 
cgit v1.2.3


From f83baa0cb6cfc92ebaf7f9d3a99d7e34f2e77a8a Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Wed, 1 Dec 2021 19:35:01 +0100
Subject: HID: add hid_is_usb() function to make it simpler for USB detection

A number of HID drivers already call hid_is_using_ll_driver() but only
for the detection of if this is a USB device or not.  Make this more
obvious by creating hid_is_usb() and calling the function that way.

Also converts the existing hid_is_using_ll_driver() functions to use the
new call.

Cc: Jiri Kosina <jikos@kernel.org>
Cc: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Cc: linux-input@vger.kernel.org
Cc: stable@vger.kernel.org
Tested-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Link: https://lore.kernel.org/r/20211201183503.2373082-1-gregkh@linuxfoundation.org
---
 include/linux/hid.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hid.h b/include/linux/hid.h
index 9e067f937dbc..f453be385bd4 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -840,6 +840,11 @@ static inline bool hid_is_using_ll_driver(struct hid_device *hdev,
 	return hdev->ll_driver == driver;
 }
 
+static inline bool hid_is_usb(struct hid_device *hdev)
+{
+	return hid_is_using_ll_driver(hdev, &usb_hid_driver);
+}
+
 #define	PM_HINT_FULLON	1<<5
 #define PM_HINT_NORMAL	1<<1
 
-- 
cgit v1.2.3


From 9e3562080950b6e3fe38a5e34ddd5b1c618f2019 Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Date: Thu, 2 Dec 2021 10:53:33 +0100
Subject: HID: add suspend/resume helpers

There is a lot of duplication of code in the HID low level drivers.
Better have everything in one place so we can eventually extend it
in a generic way.

Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/20211202095334.14399-4-benjamin.tissoires@redhat.com
---
 include/linux/hid.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hid.h b/include/linux/hid.h
index 9e067f937dbc..ebe3ec98db6b 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -923,6 +923,16 @@ s32 hid_snto32(__u32 value, unsigned n);
 __u32 hid_field_extract(const struct hid_device *hid, __u8 *report,
 		     unsigned offset, unsigned n);
 
+#ifdef CONFIG_PM
+int hid_driver_suspend(struct hid_device *hdev, pm_message_t state);
+int hid_driver_reset_resume(struct hid_device *hdev);
+int hid_driver_resume(struct hid_device *hdev);
+#else
+static inline int hid_driver_suspend(struct hid_device *hdev, pm_message_t state) { return 0; }
+static inline int hid_driver_reset_resume(struct hid_device *hdev) { return 0; }
+static inline int hid_driver_resume(struct hid_device *hdev) { return 0; }
+#endif
+
 /**
  * hid_device_io_start - enable HID input during probe, remove
  *
-- 
cgit v1.2.3


From f65a0b1f3e79444bba9ac56435eeb32db85ab2c9 Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Date: Thu, 2 Dec 2021 10:53:34 +0100
Subject: HID: do not inline some hid_hw_ functions

We don't gain much by having them as inline, and it
actually prevents us to attach a probe to those helpers.

Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/20211202095334.14399-5-benjamin.tissoires@redhat.com
---
 include/linux/hid.h | 68 +++++------------------------------------------------
 1 file changed, 6 insertions(+), 62 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hid.h b/include/linux/hid.h
index ebe3ec98db6b..b2fea7fc54a1 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -1066,6 +1066,12 @@ int __must_check hid_hw_start(struct hid_device *hdev,
 void hid_hw_stop(struct hid_device *hdev);
 int __must_check hid_hw_open(struct hid_device *hdev);
 void hid_hw_close(struct hid_device *hdev);
+void hid_hw_request(struct hid_device *hdev,
+		    struct hid_report *report, int reqtype);
+int hid_hw_raw_request(struct hid_device *hdev,
+		       unsigned char reportnum, __u8 *buf,
+		       size_t len, unsigned char rtype, int reqtype);
+int hid_hw_output_report(struct hid_device *hdev, __u8 *buf, size_t len);
 
 /**
  * hid_hw_power - requests underlying HW to go into given power mode
@@ -1083,68 +1089,6 @@ static inline int hid_hw_power(struct hid_device *hdev, int level)
 }
 
 
-/**
- * hid_hw_request - send report request to device
- *
- * @hdev: hid device
- * @report: report to send
- * @reqtype: hid request type
- */
-static inline void hid_hw_request(struct hid_device *hdev,
-				  struct hid_report *report, int reqtype)
-{
-	if (hdev->ll_driver->request)
-		return hdev->ll_driver->request(hdev, report, reqtype);
-
-	__hid_request(hdev, report, reqtype);
-}
-
-/**
- * hid_hw_raw_request - send report request to device
- *
- * @hdev: hid device
- * @reportnum: report ID
- * @buf: in/out data to transfer
- * @len: length of buf
- * @rtype: HID report type
- * @reqtype: HID_REQ_GET_REPORT or HID_REQ_SET_REPORT
- *
- * Return: count of data transferred, negative if error
- *
- * Same behavior as hid_hw_request, but with raw buffers instead.
- */
-static inline int hid_hw_raw_request(struct hid_device *hdev,
-				  unsigned char reportnum, __u8 *buf,
-				  size_t len, unsigned char rtype, int reqtype)
-{
-	if (len < 1 || len > HID_MAX_BUFFER_SIZE || !buf)
-		return -EINVAL;
-
-	return hdev->ll_driver->raw_request(hdev, reportnum, buf, len,
-						    rtype, reqtype);
-}
-
-/**
- * hid_hw_output_report - send output report to device
- *
- * @hdev: hid device
- * @buf: raw data to transfer
- * @len: length of buf
- *
- * Return: count of data transferred, negative if error
- */
-static inline int hid_hw_output_report(struct hid_device *hdev, __u8 *buf,
-					size_t len)
-{
-	if (len < 1 || len > HID_MAX_BUFFER_SIZE || !buf)
-		return -EINVAL;
-
-	if (hdev->ll_driver->output_report)
-		return hdev->ll_driver->output_report(hdev, buf, len);
-
-	return -ENOSYS;
-}
-
 /**
  * hid_hw_idle - send idle request to device
  *
-- 
cgit v1.2.3


From 8293eb995f349aed28006792cad4cb48091919dd Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 1 Dec 2021 10:10:25 -0800
Subject: bpf: Rename btf_member accessors.

Rename btf_member_bit_offset() and btf_member_bitfield_size() to
avoid conflicts with similarly named helpers in libbpf's btf.h.
Rename the kernel helpers, since libbpf helpers are part of uapi.

Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211201181040.23337-3-alexei.starovoitov@gmail.com
---
 include/linux/btf.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/btf.h b/include/linux/btf.h
index 203eef993d76..956f70388f69 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -194,15 +194,15 @@ static inline bool btf_type_kflag(const struct btf_type *t)
 	return BTF_INFO_KFLAG(t->info);
 }
 
-static inline u32 btf_member_bit_offset(const struct btf_type *struct_type,
-					const struct btf_member *member)
+static inline u32 __btf_member_bit_offset(const struct btf_type *struct_type,
+					  const struct btf_member *member)
 {
 	return btf_type_kflag(struct_type) ? BTF_MEMBER_BIT_OFFSET(member->offset)
 					   : member->offset;
 }
 
-static inline u32 btf_member_bitfield_size(const struct btf_type *struct_type,
-					   const struct btf_member *member)
+static inline u32 __btf_member_bitfield_size(const struct btf_type *struct_type,
+					     const struct btf_member *member)
 {
 	return btf_type_kflag(struct_type) ? BTF_MEMBER_BITFIELD_SIZE(member->offset)
 					   : 0;
-- 
cgit v1.2.3


From 29db4bea1d10b73749d7992c1fc9ac13499e8871 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 1 Dec 2021 10:10:26 -0800
Subject: bpf: Prepare relo_core.c for kernel duty.

Make relo_core.c to be compiled for the kernel and for user space libbpf.

Note the patch is reducing BPF_CORE_SPEC_MAX_LEN from 64 to 32.
This is the maximum number of nested structs and arrays.
For example:
 struct sample {
     int a;
     struct {
         int b[10];
     };
 };

 struct sample *s = ...;
 int *y = &s->b[5];
This field access is encoded as "0:1:0:5" and spec len is 4.

The follow up patch might bump it back to 64.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211201181040.23337-4-alexei.starovoitov@gmail.com
---
 include/linux/btf.h | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/btf.h b/include/linux/btf.h
index 956f70388f69..acef6ef28768 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -144,6 +144,53 @@ static inline bool btf_type_is_enum(const struct btf_type *t)
 	return BTF_INFO_KIND(t->info) == BTF_KIND_ENUM;
 }
 
+static inline bool str_is_empty(const char *s)
+{
+	return !s || !s[0];
+}
+
+static inline u16 btf_kind(const struct btf_type *t)
+{
+	return BTF_INFO_KIND(t->info);
+}
+
+static inline bool btf_is_enum(const struct btf_type *t)
+{
+	return btf_kind(t) == BTF_KIND_ENUM;
+}
+
+static inline bool btf_is_composite(const struct btf_type *t)
+{
+	u16 kind = btf_kind(t);
+
+	return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION;
+}
+
+static inline bool btf_is_array(const struct btf_type *t)
+{
+	return btf_kind(t) == BTF_KIND_ARRAY;
+}
+
+static inline bool btf_is_int(const struct btf_type *t)
+{
+	return btf_kind(t) == BTF_KIND_INT;
+}
+
+static inline bool btf_is_ptr(const struct btf_type *t)
+{
+	return btf_kind(t) == BTF_KIND_PTR;
+}
+
+static inline u8 btf_int_offset(const struct btf_type *t)
+{
+	return BTF_INT_OFFSET(*(u32 *)(t + 1));
+}
+
+static inline u8 btf_int_encoding(const struct btf_type *t)
+{
+	return BTF_INT_ENCODING(*(u32 *)(t + 1));
+}
+
 static inline bool btf_type_is_scalar(const struct btf_type *t)
 {
 	return btf_type_is_int(t) || btf_type_is_enum(t);
@@ -184,6 +231,11 @@ static inline u16 btf_type_vlen(const struct btf_type *t)
 	return BTF_INFO_VLEN(t->info);
 }
 
+static inline u16 btf_vlen(const struct btf_type *t)
+{
+	return btf_type_vlen(t);
+}
+
 static inline u16 btf_func_linkage(const struct btf_type *t)
 {
 	return BTF_INFO_VLEN(t->info);
@@ -208,11 +260,40 @@ static inline u32 __btf_member_bitfield_size(const struct btf_type *struct_type,
 					   : 0;
 }
 
+static inline struct btf_member *btf_members(const struct btf_type *t)
+{
+	return (struct btf_member *)(t + 1);
+}
+
+static inline u32 btf_member_bit_offset(const struct btf_type *t, u32 member_idx)
+{
+	const struct btf_member *m = btf_members(t) + member_idx;
+
+	return __btf_member_bit_offset(t, m);
+}
+
+static inline u32 btf_member_bitfield_size(const struct btf_type *t, u32 member_idx)
+{
+	const struct btf_member *m = btf_members(t) + member_idx;
+
+	return __btf_member_bitfield_size(t, m);
+}
+
 static inline const struct btf_member *btf_type_member(const struct btf_type *t)
 {
 	return (const struct btf_member *)(t + 1);
 }
 
+static inline struct btf_array *btf_array(const struct btf_type *t)
+{
+	return (struct btf_array *)(t + 1);
+}
+
+static inline struct btf_enum *btf_enum(const struct btf_type *t)
+{
+	return (struct btf_enum *)(t + 1);
+}
+
 static inline const struct btf_var_secinfo *btf_type_var_secinfo(
 		const struct btf_type *t)
 {
-- 
cgit v1.2.3


From fbd94c7afcf99c9f3b1ba1168657ecc428eb2c8d Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 1 Dec 2021 10:10:28 -0800
Subject: bpf: Pass a set of bpf_core_relo-s to prog_load command.

struct bpf_core_relo is generated by llvm and processed by libbpf.
It's a de-facto uapi.
With CO-RE in the kernel the struct bpf_core_relo becomes uapi de-jure.
Add an ability to pass a set of 'struct bpf_core_relo' to prog_load command
and let the kernel perform CO-RE relocations.

Note the struct bpf_line_info and struct bpf_func_info have the same
layout when passed from LLVM to libbpf and from libbpf to the kernel
except "insn_off" fields means "byte offset" when LLVM generates it.
Then libbpf converts it to "insn index" to pass to the kernel.
The struct bpf_core_relo's "insn_off" field is always "byte offset".

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211201181040.23337-6-alexei.starovoitov@gmail.com
---
 include/linux/bpf.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index cad0829710be..8bbf08fbab66 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1732,6 +1732,14 @@ bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog);
 const struct btf_func_model *
 bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
 			 const struct bpf_insn *insn);
+struct bpf_core_ctx {
+	struct bpf_verifier_log *log;
+	const struct btf *btf;
+};
+
+int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
+		   int relo_idx, void *insn);
+
 #else /* !CONFIG_BPF_SYSCALL */
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
 {
-- 
cgit v1.2.3


From d9847eb8be3d895b2b5f514fdf3885d47a0b92a2 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Mon, 22 Nov 2021 20:17:40 +0530
Subject: bpf: Make CONFIG_DEBUG_INFO_BTF depend upon CONFIG_BPF_SYSCALL

Vinicius Costa Gomes reported [0] that build fails when
CONFIG_DEBUG_INFO_BTF is enabled and CONFIG_BPF_SYSCALL is disabled.
This leads to btf.c not being compiled, and then no symbol being present
in vmlinux for the declarations in btf.h. Since BTF is not useful
without enabling BPF subsystem, disallow this combination.

However, theoretically disabling both now could still fail, as the
symbol for kfunc_btf_id_list variables is not available. This isn't a
problem as the compiler usually optimizes the whole register/unregister
call, but at lower optimization levels it can fail the build in linking
stage.

Fix that by adding dummy variables so that modules taking address of
them still work, but the whole thing is a noop.

  [0]: https://lore.kernel.org/bpf/20211110205418.332403-1-vinicius.gomes@intel.com

Fixes: 14f267d95fe4 ("bpf: btf: Introduce helpers for dynamic BTF set registration")
Reported-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20211122144742.477787-2-memxor@gmail.com
---
 include/linux/btf.h | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/btf.h b/include/linux/btf.h
index 203eef993d76..0e1b6281fd8f 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -245,7 +245,10 @@ struct kfunc_btf_id_set {
 	struct module *owner;
 };
 
-struct kfunc_btf_id_list;
+struct kfunc_btf_id_list {
+	struct list_head list;
+	struct mutex mutex;
+};
 
 #ifdef CONFIG_DEBUG_INFO_BTF_MODULES
 void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
@@ -254,6 +257,9 @@ void unregister_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
 				 struct kfunc_btf_id_set *s);
 bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id,
 			      struct module *owner);
+
+extern struct kfunc_btf_id_list bpf_tcp_ca_kfunc_list;
+extern struct kfunc_btf_id_list prog_test_kfunc_list;
 #else
 static inline void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
 					     struct kfunc_btf_id_set *s)
@@ -268,13 +274,13 @@ static inline bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist,
 {
 	return false;
 }
+
+static struct kfunc_btf_id_list bpf_tcp_ca_kfunc_list __maybe_unused;
+static struct kfunc_btf_id_list prog_test_kfunc_list __maybe_unused;
 #endif
 
 #define DEFINE_KFUNC_BTF_ID_SET(set, name)                                     \
 	struct kfunc_btf_id_set name = { LIST_HEAD_INIT(name.list), (set),     \
 					 THIS_MODULE }
 
-extern struct kfunc_btf_id_list bpf_tcp_ca_kfunc_list;
-extern struct kfunc_btf_id_list prog_test_kfunc_list;
-
 #endif
-- 
cgit v1.2.3


From b247f32aecad09e6cf7edff7739e6f2c9dc5fca9 Mon Sep 17 00:00:00 2001
From: Avihai Horon <avihaih@nvidia.com>
Date: Thu, 28 Oct 2021 16:03:06 +0300
Subject: net/mlx5: Dynamically resize flow counters query buffer

The flow counters bulk query buffer is allocated once during
mlx5_fc_init_stats(). For PFs and VFs this buffer usually takes a little
more than 512KB of memory, which is aligned to the next power of 2, to
1MB. For SFs, this buffer is reduced and takes around 128 Bytes.

The buffer size determines the maximum number of flow counters that
can be queried at a time. Thus, having a bigger buffer can improve
performance for users that need to query many flow counters.

There are cases that don't use many flow counters and don't need a big
buffer (e.g. SFs, VFs). Since this size is critical with large scale,
in these cases the buffer size should be reduced.

In order to reduce memory consumption while maintaining query
performance, change the query buffer's allocation scheme to the
following:
- First allocate the buffer with small initial size.
- If the number of counters surpasses the initial size, resize the
  buffer to the maximum size.

The buffer only grows and isn't shrank, because users with many flow
counters don't care about the buffer size and we don't want to add
resize overhead if the current number of counters drops.

This solution is preferable to the current one, which is less accurate
and only addresses SFs.

Signed-off-by: Avihai Horon <avihaih@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/driver.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index a623ec635947..78655d8d13a7 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -478,6 +478,10 @@ struct mlx5_fc_stats {
 	unsigned long next_query;
 	unsigned long sampling_interval; /* jiffies */
 	u32 *bulk_query_out;
+	int bulk_query_len;
+	size_t num_counters;
+	bool bulk_query_alloc_failed;
+	unsigned long next_bulk_query_alloc;
 	struct mlx5_fc_pool fc_pool;
 };
 
-- 
cgit v1.2.3


From e2748ad5257754a47376e28c0f9dda4f5c1e5ca3 Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Tue, 2 Nov 2021 16:02:00 -0600
Subject: mtd: remove unused header file <linux/mtd/latch-addr-flash.h>

Commit d24dbd7541ff ("mtd: maps: Get rid of the latch-addr-flash driver")
removed the last user of <linux/mtd/latch-addr-flash.h> but left the header
file behind.  Nothing uses this file, delete it now.

Cc: Boris Brezillon <bbrezillon@kernel.org>
Cc: Miquel Raynal <miquel.raynal@bootlin.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Vignesh Raghavendra <vigneshr@ti.com>
Cc: linux-mtd@lists.infradead.org
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20211102220203.940290-7-corbet@lwn.net
---
 include/linux/mtd/latch-addr-flash.h | 29 -----------------------------
 1 file changed, 29 deletions(-)
 delete mode 100644 include/linux/mtd/latch-addr-flash.h

(limited to 'include/linux')

diff --git a/include/linux/mtd/latch-addr-flash.h b/include/linux/mtd/latch-addr-flash.h
deleted file mode 100644
index e94b8e128074..000000000000
--- a/include/linux/mtd/latch-addr-flash.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Interface for NOR flash driver whose high address lines are latched
- *
- * Copyright © 2008 MontaVista Software, Inc. <source@mvista.com>
- *
- * This file is licensed under the terms of the GNU General Public License
- * version 2. This program is licensed "as is" without any warranty of any
- * kind, whether express or implied.
- */
-#ifndef __LATCH_ADDR_FLASH__
-#define __LATCH_ADDR_FLASH__
-
-struct map_info;
-struct mtd_partition;
-
-struct latch_addr_flash_data {
-	unsigned int		width;
-	unsigned int		size;
-
-	int			(*init)(void *data, int cs);
-	void			(*done)(void *data);
-	void			(*set_window)(unsigned long offset, void *data);
-	void			*data;
-
-	unsigned int		nr_parts;
-	struct mtd_partition	*parts;
-};
-
-#endif
-- 
cgit v1.2.3


From 2c4dcd7fd57b20a21b65da04d89c38a7217d79cf Mon Sep 17 00:00:00 2001
From: Heiko Carstens <hca@linux.ibm.com>
Date: Mon, 29 Nov 2021 14:03:07 +0100
Subject: topology/sysfs: export die attributes only if an architectures has
 support

The die_id and die_cpus topology sysfs attributes have been added with
commit 0e344d8c709f ("cpu/topology: Export die_id") and commit
2e4c54dac7b3 ("topology: Create core_cpus and die_cpus sysfs attributes").

While they are currently only used and useful for x86 they are still
present with bogus default values for all architectures. Instead of
enforcing such new sysfs attributes to all architectures, make them
only optional visible if an architecture opts in by defining both the
topology_die_id and topology_die_cpumask attributes.

This is similar to what was done when the book and drawer topology
levels were introduced: avoid useless and therefore confusing sysfs
attributes for architectures which cannot make use of them.

This should not break any existing applications, since this is a
rather new interface and applications should be able to handle also
older kernel versions without such attributes - besides that they
contain only useful information for x86.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Link: https://lore.kernel.org/r/20211129130309.3256168-2-hca@linux.ibm.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/topology.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/topology.h b/include/linux/topology.h
index 0b3704ad13c8..8d1bdae76230 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -180,6 +180,10 @@ static inline int cpu_to_mem(int cpu)
 
 #endif	/* [!]CONFIG_HAVE_MEMORYLESS_NODES */
 
+#if defined(topology_die_id) && defined(topology_die_cpumask)
+#define TOPOLOGY_DIE_SYSFS
+#endif
+
 #ifndef topology_physical_package_id
 #define topology_physical_package_id(cpu)	((void)(cpu), -1)
 #endif
-- 
cgit v1.2.3


From e795707703b32fecdd7467afcc33ff1e92416c05 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <hca@linux.ibm.com>
Date: Mon, 29 Nov 2021 14:03:08 +0100
Subject: topology/sysfs: export cluster attributes only if an architectures
 has support

The cluster_id and cluster_cpus topology sysfs attributes have been
added with commit c5e22feffdd7 ("topology: Represent clusters of CPUs
within a die").

They are currently only used for x86, arm64, and riscv (via generic
arch topology), however they are still present with bogus default
values for all other architectures. Instead of enforcing such new
sysfs attributes to all architectures, make them only optional visible
if an architecture opts in by defining both the topology_cluster_id
and topology_cluster_cpumask attributes.

This is similar to what was done when the book and drawer topology
levels were introduced: avoid useless and therefore confusing sysfs
attributes for architectures which cannot make use of them.

This should not break any existing applications, since this is a
new interface introduced with the v5.16 merge window.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Link: https://lore.kernel.org/r/20211129130309.3256168-3-hca@linux.ibm.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/topology.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/topology.h b/include/linux/topology.h
index 8d1bdae76230..d52be69037db 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -183,6 +183,9 @@ static inline int cpu_to_mem(int cpu)
 #if defined(topology_die_id) && defined(topology_die_cpumask)
 #define TOPOLOGY_DIE_SYSFS
 #endif
+#if defined(topology_cluster_id) && defined(topology_cluster_cpumask)
+#define TOPOLOGY_CLUSTER_SYSFS
+#endif
 
 #ifndef topology_physical_package_id
 #define topology_physical_package_id(cpu)	((void)(cpu), -1)
-- 
cgit v1.2.3


From f1045056c726440469d89d23c13734bcd6c0d15b Mon Sep 17 00:00:00 2001
From: Heiko Carstens <hca@linux.ibm.com>
Date: Mon, 29 Nov 2021 14:03:09 +0100
Subject: topology/sysfs: rework book and drawer topology ifdefery

Provide default defines for the topology_book_[id|cpumask] and
topology_drawer_[id|cpumask] macros just like for each other topology
level.
This way all topology levels are handled in a similar way. Still the
the book and drawer levels are only used on s390, and also the sysfs
attributes are only created on s390. However other architectures may
opt in if wanted.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Link: https://lore.kernel.org/r/20211129130309.3256168-4-hca@linux.ibm.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/topology.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/topology.h b/include/linux/topology.h
index d52be69037db..a6e201758ae9 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -186,6 +186,12 @@ static inline int cpu_to_mem(int cpu)
 #if defined(topology_cluster_id) && defined(topology_cluster_cpumask)
 #define TOPOLOGY_CLUSTER_SYSFS
 #endif
+#if defined(topology_book_id) && defined(topology_book_cpumask)
+#define TOPOLOGY_BOOK_SYSFS
+#endif
+#if defined(topology_drawer_id) && defined(topology_drawer_cpumask)
+#define TOPOLOGY_DRAWER_SYSFS
+#endif
 
 #ifndef topology_physical_package_id
 #define topology_physical_package_id(cpu)	((void)(cpu), -1)
@@ -199,6 +205,12 @@ static inline int cpu_to_mem(int cpu)
 #ifndef topology_core_id
 #define topology_core_id(cpu)			((void)(cpu), 0)
 #endif
+#ifndef topology_book_id
+#define topology_book_id(cpu)			((void)(cpu), -1)
+#endif
+#ifndef topology_drawer_id
+#define topology_drawer_id(cpu)			((void)(cpu), -1)
+#endif
 #ifndef topology_sibling_cpumask
 #define topology_sibling_cpumask(cpu)		cpumask_of(cpu)
 #endif
@@ -211,6 +223,12 @@ static inline int cpu_to_mem(int cpu)
 #ifndef topology_die_cpumask
 #define topology_die_cpumask(cpu)		cpumask_of(cpu)
 #endif
+#ifndef topology_book_cpumask
+#define topology_book_cpumask(cpu)		cpumask_of(cpu)
+#endif
+#ifndef topology_drawer_cpumask
+#define topology_drawer_cpumask(cpu)		cpumask_of(cpu)
+#endif
 
 #if defined(CONFIG_SCHED_SMT) && !defined(cpu_smt_mask)
 static inline const struct cpumask *cpu_smt_mask(int cpu)
-- 
cgit v1.2.3


From c7fdb2404f66131bc9c22e06f712717288826487 Mon Sep 17 00:00:00 2001
From: Abhyuday Godhasara <abhyuday.godhasara@xilinx.com>
Date: Sun, 28 Nov 2021 23:02:14 -0800
Subject: drivers: soc: xilinx: add xilinx event management driver

Xilinx event management driver provides an interface to subscribe or
unsubscribe for the event/callback supported by firmware. An agent can use
this driver to register for Error Event, Device Event and Suspend callback.
This driver only allows one agent per event to do registration. Driver will
return an error in case of multiple registration for the same event.

This driver gets notification from firmware through TF-A as SGI. During
initialization, event manager driver register handler for SGI used for
notification. It also provides SGI number info to TF-A by using
IOCTL_REGISTER_SGI call to TF-A.

After receiving notification from firmware, the driver makes an SMC call to
TF-A to get IPI data. From the IPI data provided by TF-A, event manager
identified the cause of event and forward that event/callback notification
to the respective subscribed driver. After this, in case of Error Event,
driver performs unregistration as firmware expecting from agent to do
re-registration if the agent wants to get notified on the second occurrence
of an error event.

Add new IOCTL id IOCTL_REGISTER_SGI = 25 which is used to register SGI on
TF-A.

Older firmware doesn't have all required support for event handling which
is required by the event manager driver. So add check for the register
notifier version in the event manager driver.

Xilinx event management driver provides support to subscribe for multiple
error events with the use of Event Mask in a single call of
xlnx_register_event(). Agent driver can provide 'Event' parameter value as
ORed of multiple event masks to register single callback for multiple
events. For example, to register callback for event=0x1 and event=0x2 for
the given node, agent can provide event=0x3 (0x1 | 0x2). It is not possible
to register multiple events for different nodes in a single registration
call.

Also provide support to receive multiple error events as in single
notification from firmware and then forward it to subscribed drivers via
registered callback one by one.

Acked-by: Michal Simek <michal.simek@xilinx.com>
Signed-off-by: Tejas Patel <tejas.patel@xilinx.com>
Signed-off-by: Rajan Vaja <rajan.vaja@xilinx.com>
Signed-off-by: Abhyuday Godhasara <abhyuday.godhasara@xilinx.com>
Link: https://lore.kernel.org/r/20211129070216.30253-2-abhyuday.godhasara@xilinx.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/firmware/xlnx-event-manager.h | 36 +++++++++++++++++++++++++++++
 include/linux/firmware/xlnx-zynqmp.h        |  2 ++
 2 files changed, 38 insertions(+)
 create mode 100644 include/linux/firmware/xlnx-event-manager.h

(limited to 'include/linux')

diff --git a/include/linux/firmware/xlnx-event-manager.h b/include/linux/firmware/xlnx-event-manager.h
new file mode 100644
index 000000000000..3f87c4929d21
--- /dev/null
+++ b/include/linux/firmware/xlnx-event-manager.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _FIRMWARE_XLNX_EVENT_MANAGER_H_
+#define _FIRMWARE_XLNX_EVENT_MANAGER_H_
+
+#include <linux/firmware/xlnx-zynqmp.h>
+
+#define CB_MAX_PAYLOAD_SIZE	(4U) /*In payload maximum 32bytes */
+
+/************************** Exported Function *****************************/
+
+typedef void (*event_cb_func_t)(const u32 *payload, void *data);
+
+#if IS_REACHABLE(CONFIG_XLNX_EVENT_MANAGER)
+int xlnx_register_event(const enum pm_api_cb_id cb_type, const u32 node_id,
+			const u32 event, const bool wake,
+			event_cb_func_t cb_fun, void *data);
+
+int xlnx_unregister_event(const enum pm_api_cb_id cb_type, const u32 node_id,
+			  const u32 event, event_cb_func_t cb_fun);
+#else
+static inline int xlnx_register_event(const enum pm_api_cb_id cb_type, const u32 node_id,
+				      const u32 event, const bool wake,
+				      event_cb_func_t cb_fun, void *data)
+{
+	return -ENODEV;
+}
+
+static inline int xlnx_unregister_event(const enum pm_api_cb_id cb_type, const u32 node_id,
+					 const u32 event, event_cb_func_t cb_fun)
+{
+	return -ENODEV;
+}
+#endif
+
+#endif /* _FIRMWARE_XLNX_EVENT_MANAGER_H_ */
diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index 077e894bb340..907cb01890cf 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -141,6 +141,8 @@ enum pm_ioctl_id {
 	/* Set healthy bit value */
 	IOCTL_SET_BOOT_HEALTH_STATUS = 17,
 	IOCTL_OSPI_MUX_SELECT = 21,
+	/* Register SGI to ATF */
+	IOCTL_REGISTER_SGI = 25,
 };
 
 enum pm_query_id {
-- 
cgit v1.2.3


From 14866a7db8da1f61fb6135c461b733694eea9580 Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Wed, 1 Dec 2021 20:43:03 -0800
Subject: Documentation/auxiliary_bus: Add example code for
 module_auxiliary_driver()

Add an example code snipit to the module_auxiliary_driver()
documentation which is consistent with the other example code in the
elsewhere in the documentation.

Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Link: https://lore.kernel.org/r/20211202044305.4006853-6-ira.weiny@intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/auxiliary_bus.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/auxiliary_bus.h b/include/linux/auxiliary_bus.h
index fc51d45f106b..605b27aab693 100644
--- a/include/linux/auxiliary_bus.h
+++ b/include/linux/auxiliary_bus.h
@@ -66,6 +66,10 @@ void auxiliary_driver_unregister(struct auxiliary_driver *auxdrv);
  * Helper macro for auxiliary drivers which do not do anything special in
  * module init/exit. This eliminates a lot of boilerplate. Each module may only
  * use this macro once, and calling it replaces module_init() and module_exit()
+ *
+ * .. code-block:: c
+ *
+ *	module_auxiliary_driver(my_drv);
  */
 #define module_auxiliary_driver(__auxiliary_driver) \
 	module_driver(__auxiliary_driver, auxiliary_driver_register, auxiliary_driver_unregister)
-- 
cgit v1.2.3


From e1b5186810cc7d4ec60447032636b8e6772dbbc6 Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Wed, 1 Dec 2021 20:43:05 -0800
Subject: Documentation/auxiliary_bus: Move the text into the code

The code and documentation are more difficult to maintain when kept
separately.  This is further compounded when the standard structure
documentation infrastructure is not used.

Move the documentation into the code, use the standard documentation
infrastructure, add current documented functions, and reference the text
in the rst file.

Suggested-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Link: https://lore.kernel.org/r/20211202044305.4006853-8-ira.weiny@intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/auxiliary_bus.h | 160 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 160 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/auxiliary_bus.h b/include/linux/auxiliary_bus.h
index 605b27aab693..e6d8b5c16226 100644
--- a/include/linux/auxiliary_bus.h
+++ b/include/linux/auxiliary_bus.h
@@ -11,12 +11,172 @@
 #include <linux/device.h>
 #include <linux/mod_devicetable.h>
 
+/**
+ * DOC: DEVICE_LIFESPAN
+ *
+ * The registering driver is the entity that allocates memory for the
+ * auxiliary_device and registers it on the auxiliary bus.  It is important to
+ * note that, as opposed to the platform bus, the registering driver is wholly
+ * responsible for the management of the memory used for the device object.
+ *
+ * To be clear the memory for the auxiliary_device is freed in the release()
+ * callback defined by the registering driver.  The registering driver should
+ * only call auxiliary_device_delete() and then auxiliary_device_uninit() when
+ * it is done with the device.  The release() function is then automatically
+ * called if and when other code releases their reference to the devices.
+ *
+ * A parent object, defined in the shared header file, contains the
+ * auxiliary_device.  It also contains a pointer to the shared object(s), which
+ * also is defined in the shared header.  Both the parent object and the shared
+ * object(s) are allocated by the registering driver.  This layout allows the
+ * auxiliary_driver's registering module to perform a container_of() call to go
+ * from the pointer to the auxiliary_device, that is passed during the call to
+ * the auxiliary_driver's probe function, up to the parent object, and then
+ * have access to the shared object(s).
+ *
+ * The memory for the shared object(s) must have a lifespan equal to, or
+ * greater than, the lifespan of the memory for the auxiliary_device.  The
+ * auxiliary_driver should only consider that the shared object is valid as
+ * long as the auxiliary_device is still registered on the auxiliary bus.  It
+ * is up to the registering driver to manage (e.g. free or keep available) the
+ * memory for the shared object beyond the life of the auxiliary_device.
+ *
+ * The registering driver must unregister all auxiliary devices before its own
+ * driver.remove() is completed.  An easy way to ensure this is to use the
+ * devm_add_action_or_reset() call to register a function against the parent
+ * device which unregisters the auxiliary device object(s).
+ *
+ * Finally, any operations which operate on the auxiliary devices must continue
+ * to function (if only to return an error) after the registering driver
+ * unregisters the auxiliary device.
+ */
+
+/**
+ * struct auxiliary_device - auxiliary device object.
+ * @dev: Device,
+ *       The release and parent fields of the device structure must be filled
+ *       in
+ * @name: Match name found by the auxiliary device driver,
+ * @id: unique identitier if multiple devices of the same name are exported,
+ *
+ * An auxiliary_device represents a part of its parent device's functionality.
+ * It is given a name that, combined with the registering drivers
+ * KBUILD_MODNAME, creates a match_name that is used for driver binding, and an
+ * id that combined with the match_name provide a unique name to register with
+ * the bus subsystem.  For example, a driver registering an auxiliary device is
+ * named 'foo_mod.ko' and the subdevice is named 'foo_dev'.  The match name is
+ * therefore 'foo_mod.foo_dev'.
+ *
+ * Registering an auxiliary_device is a three-step process.
+ *
+ * First, a 'struct auxiliary_device' needs to be defined or allocated for each
+ * sub-device desired.  The name, id, dev.release, and dev.parent fields of
+ * this structure must be filled in as follows.
+ *
+ * The 'name' field is to be given a name that is recognized by the auxiliary
+ * driver.  If two auxiliary_devices with the same match_name, eg
+ * "foo_mod.foo_dev", are registered onto the bus, they must have unique id
+ * values (e.g. "x" and "y") so that the registered devices names are
+ * "foo_mod.foo_dev.x" and "foo_mod.foo_dev.y".  If match_name + id are not
+ * unique, then the device_add fails and generates an error message.
+ *
+ * The auxiliary_device.dev.type.release or auxiliary_device.dev.release must
+ * be populated with a non-NULL pointer to successfully register the
+ * auxiliary_device.  This release call is where resources associated with the
+ * auxiliary device must be free'ed.  Because once the device is placed on the
+ * bus the parent driver can not tell what other code may have a reference to
+ * this data.
+ *
+ * The auxiliary_device.dev.parent should be set.  Typically to the registering
+ * drivers device.
+ *
+ * Second, call auxiliary_device_init(), which checks several aspects of the
+ * auxiliary_device struct and performs a device_initialize().  After this step
+ * completes, any error state must have a call to auxiliary_device_uninit() in
+ * its resolution path.
+ *
+ * The third and final step in registering an auxiliary_device is to perform a
+ * call to auxiliary_device_add(), which sets the name of the device and adds
+ * the device to the bus.
+ *
+ * .. code-block:: c
+ *
+ *      #define MY_DEVICE_NAME "foo_dev"
+ *
+ *      ...
+ *
+ *	struct auxiliary_device *my_aux_dev = my_aux_dev_alloc(xxx);
+ *
+ *	// Step 1:
+ *	my_aux_dev->name = MY_DEVICE_NAME;
+ *	my_aux_dev->id = my_unique_id_alloc(xxx);
+ *	my_aux_dev->dev.release = my_aux_dev_release;
+ *	my_aux_dev->dev.parent = my_dev;
+ *
+ *	// Step 2:
+ *	if (auxiliary_device_init(my_aux_dev))
+ *		goto fail;
+ *
+ *	// Step 3:
+ *	if (auxiliary_device_add(my_aux_dev)) {
+ *		auxiliary_device_uninit(my_aux_dev);
+ *		goto fail;
+ *	}
+ *
+ *	...
+ *
+ *
+ * Unregistering an auxiliary_device is a two-step process to mirror the
+ * register process.  First call auxiliary_device_delete(), then call
+ * auxiliary_device_uninit().
+ *
+ * .. code-block:: c
+ *
+ *         auxiliary_device_delete(my_dev->my_aux_dev);
+ *         auxiliary_device_uninit(my_dev->my_aux_dev);
+ */
 struct auxiliary_device {
 	struct device dev;
 	const char *name;
 	u32 id;
 };
 
+/**
+ * struct auxiliary_driver - Definition of an auxiliary bus driver
+ * @probe: Called when a matching device is added to the bus.
+ * @remove: Called when device is removed from the bus.
+ * @shutdown: Called at shut-down time to quiesce the device.
+ * @suspend: Called to put the device to sleep mode. Usually to a power state.
+ * @resume: Called to bring a device from sleep mode.
+ * @name: Driver name.
+ * @driver: Core driver structure.
+ * @id_table: Table of devices this driver should match on the bus.
+ *
+ * Auxiliary drivers follow the standard driver model convention, where
+ * discovery/enumeration is handled by the core, and drivers provide probe()
+ * and remove() methods. They support power management and shutdown
+ * notifications using the standard conventions.
+ *
+ * Auxiliary drivers register themselves with the bus by calling
+ * auxiliary_driver_register(). The id_table contains the match_names of
+ * auxiliary devices that a driver can bind with.
+ *
+ * .. code-block:: c
+ *
+ *         static const struct auxiliary_device_id my_auxiliary_id_table[] = {
+ *		   { .name = "foo_mod.foo_dev" },
+ *                 {},
+ *         };
+ *
+ *         MODULE_DEVICE_TABLE(auxiliary, my_auxiliary_id_table);
+ *
+ *         struct auxiliary_driver my_drv = {
+ *                 .name = "myauxiliarydrv",
+ *                 .id_table = my_auxiliary_id_table,
+ *                 .probe = my_drv_probe,
+ *                 .remove = my_drv_remove
+ *         };
+ */
 struct auxiliary_driver {
 	int (*probe)(struct auxiliary_device *auxdev, const struct auxiliary_device_id *id);
 	void (*remove)(struct auxiliary_device *auxdev);
-- 
cgit v1.2.3


From bb49e9e730c2906a958eee273a7819f401543d6c Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Fri, 3 Dec 2021 12:16:58 +0100
Subject: fs: add is_idmapped_mnt() helper

Multiple places open-code the same check to determine whether a given
mount is idmapped. Introduce a simple helper function that can be used
instead. This allows us to get rid of the fragile open-coding. We will
later change the check that is used to determine whether a given mount
is idmapped. Introducing a helper allows us to do this in a single
place instead of doing it for multiple places.

Link: https://lore.kernel.org/r/20211123114227.3124056-2-brauner@kernel.org (v1)
Link: https://lore.kernel.org/r/20211130121032.3753852-2-brauner@kernel.org (v2)
Link: https://lore.kernel.org/r/20211203111707.3901969-2-brauner@kernel.org
Cc: Seth Forshee <sforshee@digitalocean.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
CC: linux-fsdevel@vger.kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Seth Forshee <sforshee@digitalocean.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 include/linux/fs.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index bbf812ce89a8..06cbefd76de7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2724,6 +2724,20 @@ static inline struct user_namespace *file_mnt_user_ns(struct file *file)
 {
 	return mnt_user_ns(file->f_path.mnt);
 }
+
+/**
+ * is_idmapped_mnt - check whether a mount is mapped
+ * @mnt: the mount to check
+ *
+ * If @mnt has an idmapping attached to it @mnt is mapped.
+ *
+ * Return: true if mount is mapped, false if not.
+ */
+static inline bool is_idmapped_mnt(const struct vfsmount *mnt)
+{
+	return mnt_user_ns(mnt) != &init_user_ns;
+}
+
 extern long vfs_truncate(const struct path *, loff_t);
 int do_truncate(struct user_namespace *, struct dentry *, loff_t start,
 		unsigned int time_attrs, struct file *filp);
-- 
cgit v1.2.3


From a793d79ea3e041081cd7cbd8ee43d0b5e4914a2b Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Fri, 3 Dec 2021 12:16:59 +0100
Subject: fs: move mapping helpers

The low-level mapping helpers were so far crammed into fs.h. They are
out of place there. The fs.h header should just contain the higher-level
mapping helpers that interact directly with vfs objects such as struct
super_block or struct inode and not the bare mapping helpers. Similarly,
only vfs and specific fs code shall interact with low-level mapping
helpers. And so they won't be made accessible automatically through
regular {g,u}id helpers.

Link: https://lore.kernel.org/r/20211123114227.3124056-3-brauner@kernel.org (v1)
Link: https://lore.kernel.org/r/20211130121032.3753852-3-brauner@kernel.org (v2)
Link: https://lore.kernel.org/r/20211203111707.3901969-3-brauner@kernel.org
Cc: Seth Forshee <sforshee@digitalocean.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
CC: linux-fsdevel@vger.kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Seth Forshee <sforshee@digitalocean.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 include/linux/fs.h            |  91 +------------------------------------
 include/linux/mnt_idmapping.h | 101 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 102 insertions(+), 90 deletions(-)
 create mode 100644 include/linux/mnt_idmapping.h

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 06cbefd76de7..b3bcb2129699 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -41,6 +41,7 @@
 #include <linux/stddef.h>
 #include <linux/mount.h>
 #include <linux/cred.h>
+#include <linux/mnt_idmapping.h>
 
 #include <asm/byteorder.h>
 #include <uapi/linux/fs.h>
@@ -1624,34 +1625,6 @@ static inline void i_gid_write(struct inode *inode, gid_t gid)
 	inode->i_gid = make_kgid(inode->i_sb->s_user_ns, gid);
 }
 
-/**
- * kuid_into_mnt - map a kuid down into a mnt_userns
- * @mnt_userns: user namespace of the relevant mount
- * @kuid: kuid to be mapped
- *
- * Return: @kuid mapped according to @mnt_userns.
- * If @kuid has no mapping INVALID_UID is returned.
- */
-static inline kuid_t kuid_into_mnt(struct user_namespace *mnt_userns,
-				   kuid_t kuid)
-{
-	return make_kuid(mnt_userns, __kuid_val(kuid));
-}
-
-/**
- * kgid_into_mnt - map a kgid down into a mnt_userns
- * @mnt_userns: user namespace of the relevant mount
- * @kgid: kgid to be mapped
- *
- * Return: @kgid mapped according to @mnt_userns.
- * If @kgid has no mapping INVALID_GID is returned.
- */
-static inline kgid_t kgid_into_mnt(struct user_namespace *mnt_userns,
-				   kgid_t kgid)
-{
-	return make_kgid(mnt_userns, __kgid_val(kgid));
-}
-
 /**
  * i_uid_into_mnt - map an inode's i_uid down into a mnt_userns
  * @mnt_userns: user namespace of the mount the inode was found from
@@ -1680,68 +1653,6 @@ static inline kgid_t i_gid_into_mnt(struct user_namespace *mnt_userns,
 	return kgid_into_mnt(mnt_userns, inode->i_gid);
 }
 
-/**
- * kuid_from_mnt - map a kuid up into a mnt_userns
- * @mnt_userns: user namespace of the relevant mount
- * @kuid: kuid to be mapped
- *
- * Return: @kuid mapped up according to @mnt_userns.
- * If @kuid has no mapping INVALID_UID is returned.
- */
-static inline kuid_t kuid_from_mnt(struct user_namespace *mnt_userns,
-				   kuid_t kuid)
-{
-	return KUIDT_INIT(from_kuid(mnt_userns, kuid));
-}
-
-/**
- * kgid_from_mnt - map a kgid up into a mnt_userns
- * @mnt_userns: user namespace of the relevant mount
- * @kgid: kgid to be mapped
- *
- * Return: @kgid mapped up according to @mnt_userns.
- * If @kgid has no mapping INVALID_GID is returned.
- */
-static inline kgid_t kgid_from_mnt(struct user_namespace *mnt_userns,
-				   kgid_t kgid)
-{
-	return KGIDT_INIT(from_kgid(mnt_userns, kgid));
-}
-
-/**
- * mapped_fsuid - return caller's fsuid mapped up into a mnt_userns
- * @mnt_userns: user namespace of the relevant mount
- *
- * Use this helper to initialize a new vfs or filesystem object based on
- * the caller's fsuid. A common example is initializing the i_uid field of
- * a newly allocated inode triggered by a creation event such as mkdir or
- * O_CREAT. Other examples include the allocation of quotas for a specific
- * user.
- *
- * Return: the caller's current fsuid mapped up according to @mnt_userns.
- */
-static inline kuid_t mapped_fsuid(struct user_namespace *mnt_userns)
-{
-	return kuid_from_mnt(mnt_userns, current_fsuid());
-}
-
-/**
- * mapped_fsgid - return caller's fsgid mapped up into a mnt_userns
- * @mnt_userns: user namespace of the relevant mount
- *
- * Use this helper to initialize a new vfs or filesystem object based on
- * the caller's fsgid. A common example is initializing the i_gid field of
- * a newly allocated inode triggered by a creation event such as mkdir or
- * O_CREAT. Other examples include the allocation of quotas for a specific
- * user.
- *
- * Return: the caller's current fsgid mapped up according to @mnt_userns.
- */
-static inline kgid_t mapped_fsgid(struct user_namespace *mnt_userns)
-{
-	return kgid_from_mnt(mnt_userns, current_fsgid());
-}
-
 /**
  * inode_fsuid_set - initialize inode's i_uid field with callers fsuid
  * @inode: inode to initialize
diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h
new file mode 100644
index 000000000000..47c7811fadfe
--- /dev/null
+++ b/include/linux/mnt_idmapping.h
@@ -0,0 +1,101 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_MNT_IDMAPPING_H
+#define _LINUX_MNT_IDMAPPING_H
+
+#include <linux/types.h>
+#include <linux/uidgid.h>
+
+struct user_namespace;
+extern struct user_namespace init_user_ns;
+
+/**
+ * kuid_into_mnt - map a kuid down into a mnt_userns
+ * @mnt_userns: user namespace of the relevant mount
+ * @kuid: kuid to be mapped
+ *
+ * Return: @kuid mapped according to @mnt_userns.
+ * If @kuid has no mapping INVALID_UID is returned.
+ */
+static inline kuid_t kuid_into_mnt(struct user_namespace *mnt_userns,
+				   kuid_t kuid)
+{
+	return make_kuid(mnt_userns, __kuid_val(kuid));
+}
+
+/**
+ * kgid_into_mnt - map a kgid down into a mnt_userns
+ * @mnt_userns: user namespace of the relevant mount
+ * @kgid: kgid to be mapped
+ *
+ * Return: @kgid mapped according to @mnt_userns.
+ * If @kgid has no mapping INVALID_GID is returned.
+ */
+static inline kgid_t kgid_into_mnt(struct user_namespace *mnt_userns,
+				   kgid_t kgid)
+{
+	return make_kgid(mnt_userns, __kgid_val(kgid));
+}
+
+/**
+ * kuid_from_mnt - map a kuid up into a mnt_userns
+ * @mnt_userns: user namespace of the relevant mount
+ * @kuid: kuid to be mapped
+ *
+ * Return: @kuid mapped up according to @mnt_userns.
+ * If @kuid has no mapping INVALID_UID is returned.
+ */
+static inline kuid_t kuid_from_mnt(struct user_namespace *mnt_userns,
+				   kuid_t kuid)
+{
+	return KUIDT_INIT(from_kuid(mnt_userns, kuid));
+}
+
+/**
+ * kgid_from_mnt - map a kgid up into a mnt_userns
+ * @mnt_userns: user namespace of the relevant mount
+ * @kgid: kgid to be mapped
+ *
+ * Return: @kgid mapped up according to @mnt_userns.
+ * If @kgid has no mapping INVALID_GID is returned.
+ */
+static inline kgid_t kgid_from_mnt(struct user_namespace *mnt_userns,
+				   kgid_t kgid)
+{
+	return KGIDT_INIT(from_kgid(mnt_userns, kgid));
+}
+
+/**
+ * mapped_fsuid - return caller's fsuid mapped up into a mnt_userns
+ * @mnt_userns: user namespace of the relevant mount
+ *
+ * Use this helper to initialize a new vfs or filesystem object based on
+ * the caller's fsuid. A common example is initializing the i_uid field of
+ * a newly allocated inode triggered by a creation event such as mkdir or
+ * O_CREAT. Other examples include the allocation of quotas for a specific
+ * user.
+ *
+ * Return: the caller's current fsuid mapped up according to @mnt_userns.
+ */
+static inline kuid_t mapped_fsuid(struct user_namespace *mnt_userns)
+{
+	return kuid_from_mnt(mnt_userns, current_fsuid());
+}
+
+/**
+ * mapped_fsgid - return caller's fsgid mapped up into a mnt_userns
+ * @mnt_userns: user namespace of the relevant mount
+ *
+ * Use this helper to initialize a new vfs or filesystem object based on
+ * the caller's fsgid. A common example is initializing the i_gid field of
+ * a newly allocated inode triggered by a creation event such as mkdir or
+ * O_CREAT. Other examples include the allocation of quotas for a specific
+ * user.
+ *
+ * Return: the caller's current fsgid mapped up according to @mnt_userns.
+ */
+static inline kgid_t mapped_fsgid(struct user_namespace *mnt_userns)
+{
+	return kgid_from_mnt(mnt_userns, current_fsgid());
+}
+
+#endif /* _LINUX_MNT_IDMAPPING_H */
-- 
cgit v1.2.3


From 476860b3eb4a50958243158861d5340066df5af2 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Fri, 3 Dec 2021 12:17:00 +0100
Subject: fs: tweak fsuidgid_has_mapping()

If the caller's fs{g,u}id aren't mapped in the mount's idmapping we can
return early and skip the check whether the mapped fs{g,u}id also have a
mapping in the filesystem's idmapping. If the fs{g,u}id aren't mapped in
the mount's idmapping they consequently can't be mapped in the
filesystem's idmapping. So there's no point in checking that.

Link: https://lore.kernel.org/r/20211123114227.3124056-4-brauner@kernel.org (v1)
Link: https://lore.kernel.org/r/20211130121032.3753852-4-brauner@kernel.org (v2)
Link: https://lore.kernel.org/r/20211203111707.3901969-4-brauner@kernel.org
Cc: Seth Forshee <sforshee@digitalocean.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
CC: linux-fsdevel@vger.kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Seth Forshee <sforshee@digitalocean.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 include/linux/fs.h | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index b3bcb2129699..db5ee15e36b1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1695,10 +1695,18 @@ static inline void inode_fsgid_set(struct inode *inode,
 static inline bool fsuidgid_has_mapping(struct super_block *sb,
 					struct user_namespace *mnt_userns)
 {
-	struct user_namespace *s_user_ns = sb->s_user_ns;
+	struct user_namespace *fs_userns = sb->s_user_ns;
+	kuid_t kuid;
+	kgid_t kgid;
 
-	return kuid_has_mapping(s_user_ns, mapped_fsuid(mnt_userns)) &&
-	       kgid_has_mapping(s_user_ns, mapped_fsgid(mnt_userns));
+	kuid = mapped_fsuid(mnt_userns);
+	if (!uid_valid(kuid))
+		return false;
+	kgid = mapped_fsgid(mnt_userns);
+	if (!gid_valid(kgid))
+		return false;
+	return kuid_has_mapping(fs_userns, kuid) &&
+	       kgid_has_mapping(fs_userns, kgid);
 }
 
 extern struct timespec64 current_time(struct inode *inode);
-- 
cgit v1.2.3


From 1ac2a4104968e0a60b4b3572216a92aab5c1b025 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Fri, 3 Dec 2021 12:17:01 +0100
Subject: fs: account for filesystem mappings

Currently we only support idmapped mounts for filesystems mounted
without an idmapping. This was a conscious decision mentioned in
multiple places (cf. e.g. [1]).

As explained at length in [3] it is perfectly fine to extend support for
idmapped mounts to filesystem's mounted with an idmapping should the
need arise. The need has been there for some time now. Various container
projects in userspace need this to run unprivileged and nested
unprivileged containers (cf. [2]).

Before we can port any filesystem that is mountable with an idmapping to
support idmapped mounts we need to first extend the mapping helpers to
account for the filesystem's idmapping. This again, is explained at
length in our documentation at [3] but I'll give an overview here again.

Currently, the low-level mapping helpers implement the remapping
algorithms described in [3] in a simplified manner. Because we could
rely on the fact that all filesystems supporting idmapped mounts are
mounted without an idmapping the translation step from or into the
filesystem idmapping could be skipped.

In order to support idmapped mounts of filesystem's mountable with an
idmapping the translation step we were able to skip before cannot be
skipped anymore. A filesystem mounted with an idmapping is very likely
to not use an identity mapping and will instead use a non-identity
mapping. So the translation step from or into the filesystem's idmapping
in the remapping algorithm cannot be skipped for such filesystems. More
details with examples can be found in [3].

This patch adds a few new and prepares some already existing low-level
mapping helpers to perform the full translation algorithm explained in
[3]. The low-level helpers can be written in a way that they only
perform the additional translation step when the filesystem is indeed
mounted with an idmapping.

If the low-level helpers detect that they are not dealing with an
idmapped mount they can simply return the relevant k{g,u}id unchanged;
no remapping needs to be performed at all. The no_idmapping() helper
detects whether the shortcut can be used.

If the low-level helpers detected that they are dealing with an idmapped
mount but the underlying filesystem is mounted without an idmapping we
can rely on the previous shorcut and can continue to skip the
translation step from or into the filesystem's idmapping.

These checks guarantee that only the minimal amount of work is
performed. As before, if idmapped mounts aren't used the low-level
helpers are idempotent and no work is performed at all.

This patch adds the helpers mapped_k{g,u}id_fs() and
mapped_k{g,u}id_user(). Following patches will port all places to
replace the old k{g,u}id_into_mnt() and k{g,u}id_from_mnt() with these
two new helpers. After the conversion is done k{g,u}id_into_mnt() and
k{g,u}id_from_mnt() will be removed. This also concludes the renaming of
the mapping helpers we started in [4]. Now, all mapping helpers will
started with the "mapped_" prefix making everything nice and consistent.

The mapped_k{g,u}id_fs() helpers replace the k{g,u}id_into_mnt()
helpers. They are to be used when k{g,u}ids are to be mapped from the
vfs, e.g. from from struct inode's i_{g,u}id.  Conversely, the
mapped_k{g,u}id_user() helpers replace the k{g,u}id_from_mnt() helpers.
They are to be used when k{g,u}ids are to be written to disk, e.g. when
entering from a system call to change ownership of a file.

This patch only introduces the helpers. It doesn't yet convert the
relevant places to account for filesystem mounted with an idmapping.

[1]: commit 2ca4dcc4909d ("fs/mount_setattr: tighten permission checks")
[2]: https://github.com/containers/podman/issues/10374
[3]: Documentations/filesystems/idmappings.rst
[4]: commit a65e58e791a1 ("fs: document and rename fsid helpers")

Link: https://lore.kernel.org/r/20211123114227.3124056-5-brauner@kernel.org (v1)
Link: https://lore.kernel.org/r/20211130121032.3753852-5-brauner@kernel.org (v2)
Link: https://lore.kernel.org/r/20211203111707.3901969-5-brauner@kernel.org
Cc: Seth Forshee <sforshee@digitalocean.com>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
CC: linux-fsdevel@vger.kernel.org
Reviewed-by: Seth Forshee <sforshee@digitalocean.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 include/linux/fs.h            |   4 +-
 include/linux/mnt_idmapping.h | 193 +++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 191 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index db5ee15e36b1..57aee6ebba72 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1636,7 +1636,7 @@ static inline void i_gid_write(struct inode *inode, gid_t gid)
 static inline kuid_t i_uid_into_mnt(struct user_namespace *mnt_userns,
 				    const struct inode *inode)
 {
-	return kuid_into_mnt(mnt_userns, inode->i_uid);
+	return mapped_kuid_fs(mnt_userns, &init_user_ns, inode->i_uid);
 }
 
 /**
@@ -1650,7 +1650,7 @@ static inline kuid_t i_uid_into_mnt(struct user_namespace *mnt_userns,
 static inline kgid_t i_gid_into_mnt(struct user_namespace *mnt_userns,
 				    const struct inode *inode)
 {
-	return kgid_into_mnt(mnt_userns, inode->i_gid);
+	return mapped_kgid_fs(mnt_userns, &init_user_ns, inode->i_gid);
 }
 
 /**
diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h
index 47c7811fadfe..60341cd33ccc 100644
--- a/include/linux/mnt_idmapping.h
+++ b/include/linux/mnt_idmapping.h
@@ -6,6 +6,11 @@
 #include <linux/uidgid.h>
 
 struct user_namespace;
+/*
+ * Carries the initial idmapping of 0:0:4294967295 which is an identity
+ * mapping. This means that {g,u}id 0 is mapped to {g,u}id 0, {g,u}id 1 is
+ * mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...].
+ */
 extern struct user_namespace init_user_ns;
 
 /**
@@ -64,9 +69,189 @@ static inline kgid_t kgid_from_mnt(struct user_namespace *mnt_userns,
 	return KGIDT_INIT(from_kgid(mnt_userns, kgid));
 }
 
+/**
+ * initial_idmapping - check whether this is the initial mapping
+ * @ns: idmapping to check
+ *
+ * Check whether this is the initial mapping, mapping 0 to 0, 1 to 1,
+ * [...], 1000 to 1000 [...].
+ *
+ * Return: true if this is the initial mapping, false if not.
+ */
+static inline bool initial_idmapping(const struct user_namespace *ns)
+{
+	return ns == &init_user_ns;
+}
+
+/**
+ * no_idmapping - check whether we can skip remapping a kuid/gid
+ * @mnt_userns: the mount's idmapping
+ * @fs_userns: the filesystem's idmapping
+ *
+ * This function can be used to check whether a remapping between two
+ * idmappings is required.
+ * An idmapped mount is a mount that has an idmapping attached to it that
+ * is different from the filsystem's idmapping and the initial idmapping.
+ * If the initial mapping is used or the idmapping of the mount and the
+ * filesystem are identical no remapping is required.
+ *
+ * Return: true if remapping can be skipped, false if not.
+ */
+static inline bool no_idmapping(const struct user_namespace *mnt_userns,
+				const struct user_namespace *fs_userns)
+{
+	return initial_idmapping(mnt_userns) || mnt_userns == fs_userns;
+}
+
+/**
+ * mapped_kuid_fs - map a filesystem kuid into a mnt_userns
+ * @mnt_userns: the mount's idmapping
+ * @fs_userns: the filesystem's idmapping
+ * @kuid : kuid to be mapped
+ *
+ * Take a @kuid and remap it from @fs_userns into @mnt_userns. Use this
+ * function when preparing a @kuid to be reported to userspace.
+ *
+ * If no_idmapping() determines that this is not an idmapped mount we can
+ * simply return @kuid unchanged.
+ * If initial_idmapping() tells us that the filesystem is not mounted with an
+ * idmapping we know the value of @kuid won't change when calling
+ * from_kuid() so we can simply retrieve the value via __kuid_val()
+ * directly.
+ *
+ * Return: @kuid mapped according to @mnt_userns.
+ * If @kuid has no mapping in either @mnt_userns or @fs_userns INVALID_UID is
+ * returned.
+ */
+static inline kuid_t mapped_kuid_fs(struct user_namespace *mnt_userns,
+				    struct user_namespace *fs_userns,
+				    kuid_t kuid)
+{
+	uid_t uid;
+
+	if (no_idmapping(mnt_userns, fs_userns))
+		return kuid;
+	if (initial_idmapping(fs_userns))
+		uid = __kuid_val(kuid);
+	else
+		uid = from_kuid(fs_userns, kuid);
+	if (uid == (uid_t)-1)
+		return INVALID_UID;
+	return make_kuid(mnt_userns, uid);
+}
+
+/**
+ * mapped_kgid_fs - map a filesystem kgid into a mnt_userns
+ * @mnt_userns: the mount's idmapping
+ * @fs_userns: the filesystem's idmapping
+ * @kgid : kgid to be mapped
+ *
+ * Take a @kgid and remap it from @fs_userns into @mnt_userns. Use this
+ * function when preparing a @kgid to be reported to userspace.
+ *
+ * If no_idmapping() determines that this is not an idmapped mount we can
+ * simply return @kgid unchanged.
+ * If initial_idmapping() tells us that the filesystem is not mounted with an
+ * idmapping we know the value of @kgid won't change when calling
+ * from_kgid() so we can simply retrieve the value via __kgid_val()
+ * directly.
+ *
+ * Return: @kgid mapped according to @mnt_userns.
+ * If @kgid has no mapping in either @mnt_userns or @fs_userns INVALID_GID is
+ * returned.
+ */
+static inline kgid_t mapped_kgid_fs(struct user_namespace *mnt_userns,
+				    struct user_namespace *fs_userns,
+				    kgid_t kgid)
+{
+	gid_t gid;
+
+	if (no_idmapping(mnt_userns, fs_userns))
+		return kgid;
+	if (initial_idmapping(fs_userns))
+		gid = __kgid_val(kgid);
+	else
+		gid = from_kgid(fs_userns, kgid);
+	if (gid == (gid_t)-1)
+		return INVALID_GID;
+	return make_kgid(mnt_userns, gid);
+}
+
+/**
+ * mapped_kuid_user - map a user kuid into a mnt_userns
+ * @mnt_userns: the mount's idmapping
+ * @fs_userns: the filesystem's idmapping
+ * @kuid : kuid to be mapped
+ *
+ * Use the idmapping of @mnt_userns to remap a @kuid into @fs_userns. Use this
+ * function when preparing a @kuid to be written to disk or inode.
+ *
+ * If no_idmapping() determines that this is not an idmapped mount we can
+ * simply return @kuid unchanged.
+ * If initial_idmapping() tells us that the filesystem is not mounted with an
+ * idmapping we know the value of @kuid won't change when calling
+ * make_kuid() so we can simply retrieve the value via KUIDT_INIT()
+ * directly.
+ *
+ * Return: @kuid mapped according to @mnt_userns.
+ * If @kuid has no mapping in either @mnt_userns or @fs_userns INVALID_UID is
+ * returned.
+ */
+static inline kuid_t mapped_kuid_user(struct user_namespace *mnt_userns,
+				      struct user_namespace *fs_userns,
+				      kuid_t kuid)
+{
+	uid_t uid;
+
+	if (no_idmapping(mnt_userns, fs_userns))
+		return kuid;
+	uid = from_kuid(mnt_userns, kuid);
+	if (uid == (uid_t)-1)
+		return INVALID_UID;
+	if (initial_idmapping(fs_userns))
+		return KUIDT_INIT(uid);
+	return make_kuid(fs_userns, uid);
+}
+
+/**
+ * mapped_kgid_user - map a user kgid into a mnt_userns
+ * @mnt_userns: the mount's idmapping
+ * @fs_userns: the filesystem's idmapping
+ * @kgid : kgid to be mapped
+ *
+ * Use the idmapping of @mnt_userns to remap a @kgid into @fs_userns. Use this
+ * function when preparing a @kgid to be written to disk or inode.
+ *
+ * If no_idmapping() determines that this is not an idmapped mount we can
+ * simply return @kgid unchanged.
+ * If initial_idmapping() tells us that the filesystem is not mounted with an
+ * idmapping we know the value of @kgid won't change when calling
+ * make_kgid() so we can simply retrieve the value via KGIDT_INIT()
+ * directly.
+ *
+ * Return: @kgid mapped according to @mnt_userns.
+ * If @kgid has no mapping in either @mnt_userns or @fs_userns INVALID_GID is
+ * returned.
+ */
+static inline kgid_t mapped_kgid_user(struct user_namespace *mnt_userns,
+				      struct user_namespace *fs_userns,
+				      kgid_t kgid)
+{
+	gid_t gid;
+
+	if (no_idmapping(mnt_userns, fs_userns))
+		return kgid;
+	gid = from_kgid(mnt_userns, kgid);
+	if (gid == (gid_t)-1)
+		return INVALID_GID;
+	if (initial_idmapping(fs_userns))
+		return KGIDT_INIT(gid);
+	return make_kgid(fs_userns, gid);
+}
+
 /**
  * mapped_fsuid - return caller's fsuid mapped up into a mnt_userns
- * @mnt_userns: user namespace of the relevant mount
+ * @mnt_userns: the mount's idmapping
  *
  * Use this helper to initialize a new vfs or filesystem object based on
  * the caller's fsuid. A common example is initializing the i_uid field of
@@ -78,12 +263,12 @@ static inline kgid_t kgid_from_mnt(struct user_namespace *mnt_userns,
  */
 static inline kuid_t mapped_fsuid(struct user_namespace *mnt_userns)
 {
-	return kuid_from_mnt(mnt_userns, current_fsuid());
+	return mapped_kuid_user(mnt_userns, &init_user_ns, current_fsuid());
 }
 
 /**
  * mapped_fsgid - return caller's fsgid mapped up into a mnt_userns
- * @mnt_userns: user namespace of the relevant mount
+ * @mnt_userns: the mount's idmapping
  *
  * Use this helper to initialize a new vfs or filesystem object based on
  * the caller's fsgid. A common example is initializing the i_gid field of
@@ -95,7 +280,7 @@ static inline kuid_t mapped_fsuid(struct user_namespace *mnt_userns)
  */
 static inline kgid_t mapped_fsgid(struct user_namespace *mnt_userns)
 {
-	return kgid_from_mnt(mnt_userns, current_fsgid());
+	return mapped_kgid_user(mnt_userns, &init_user_ns, current_fsgid());
 }
 
 #endif /* _LINUX_MNT_IDMAPPING_H */
-- 
cgit v1.2.3


From 8581fd402a0cf80b5298e3b225e7a7bd8f110e69 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 2 Dec 2021 12:34:00 -0800
Subject: treewide: Add missing includes masked by cgroup -> bpf dependency
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

cgroup.h (therefore swap.h, therefore half of the universe)
includes bpf.h which in turn includes module.h and slab.h.
Since we're about to get rid of that dependency we need
to clean things up.

v2: drop the cpu.h include from cacheinfo.h, it's not necessary
and it makes riscv sensitive to ordering of include files.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Krzysztof Wilczyński <kw@linux.com>
Acked-by: Peter Chen <peter.chen@kernel.org>
Acked-by: SeongJae Park <sj@kernel.org>
Acked-by: Jani Nikula <jani.nikula@intel.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/all/20211120035253.72074-1-kuba@kernel.org/  # v1
Link: https://lore.kernel.org/all/20211120165528.197359-1-kuba@kernel.org/ # cacheinfo discussion
Link: https://lore.kernel.org/bpf/20211202203400.1208663-1-kuba@kernel.org
---
 include/linux/cacheinfo.h     | 1 -
 include/linux/device/driver.h | 1 +
 include/linux/filter.h        | 2 +-
 3 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h
index 2f909ed084c6..4ff37cb763ae 100644
--- a/include/linux/cacheinfo.h
+++ b/include/linux/cacheinfo.h
@@ -3,7 +3,6 @@
 #define _LINUX_CACHEINFO_H
 
 #include <linux/bitops.h>
-#include <linux/cpu.h>
 #include <linux/cpumask.h>
 #include <linux/smp.h>
 
diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h
index a498ebcf4993..15e7c5e15d62 100644
--- a/include/linux/device/driver.h
+++ b/include/linux/device/driver.h
@@ -18,6 +18,7 @@
 #include <linux/klist.h>
 #include <linux/pm.h>
 #include <linux/device/bus.h>
+#include <linux/module.h>
 
 /**
  * enum probe_type - device driver probe type to try
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 534f678ca50f..7f1e88e3e2b5 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -6,6 +6,7 @@
 #define __LINUX_FILTER_H__
 
 #include <linux/atomic.h>
+#include <linux/bpf.h>
 #include <linux/refcount.h>
 #include <linux/compat.h>
 #include <linux/skbuff.h>
@@ -26,7 +27,6 @@
 
 #include <asm/byteorder.h>
 #include <uapi/linux/filter.h>
-#include <uapi/linux/bpf.h>
 
 struct sk_buff;
 struct sock;
-- 
cgit v1.2.3


From 4bdcd1dd4d2f973b1a89fb20ba720d879e9e506b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 28 Oct 2021 08:47:05 -0600
Subject: mm: move filemap_range_needs_writeback() into header

No functional changes in this patch, just in preparation for efficiently
calling this light function from the block O_DIRECT handling.

Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/fs.h      |  2 --
 include/linux/pagemap.h | 29 +++++++++++++++++++++++++++++
 2 files changed, 29 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index bbf812ce89a8..6b8dc1a78df6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2847,8 +2847,6 @@ static inline int filemap_fdatawait(struct address_space *mapping)
 
 extern bool filemap_range_has_page(struct address_space *, loff_t lstart,
 				  loff_t lend);
-extern bool filemap_range_needs_writeback(struct address_space *,
-					  loff_t lstart, loff_t lend);
 extern int filemap_write_and_wait_range(struct address_space *mapping,
 				        loff_t lstart, loff_t lend);
 extern int __filemap_fdatawrite_range(struct address_space *mapping,
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 605246452305..274a0710f2c5 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -963,6 +963,35 @@ static inline int add_to_page_cache(struct page *page,
 int __filemap_add_folio(struct address_space *mapping, struct folio *folio,
 		pgoff_t index, gfp_t gfp, void **shadowp);
 
+bool filemap_range_has_writeback(struct address_space *mapping,
+				 loff_t start_byte, loff_t end_byte);
+
+/**
+ * filemap_range_needs_writeback - check if range potentially needs writeback
+ * @mapping:           address space within which to check
+ * @start_byte:        offset in bytes where the range starts
+ * @end_byte:          offset in bytes where the range ends (inclusive)
+ *
+ * Find at least one page in the range supplied, usually used to check if
+ * direct writing in this range will trigger a writeback. Used by O_DIRECT
+ * read/write with IOCB_NOWAIT, to see if the caller needs to do
+ * filemap_write_and_wait_range() before proceeding.
+ *
+ * Return: %true if the caller should do filemap_write_and_wait_range() before
+ * doing O_DIRECT to a page in this range, %false otherwise.
+ */
+static inline bool filemap_range_needs_writeback(struct address_space *mapping,
+						 loff_t start_byte,
+						 loff_t end_byte)
+{
+	if (!mapping->nrpages)
+		return false;
+	if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
+	    !mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
+		return false;
+	return filemap_range_has_writeback(mapping, start_byte, end_byte);
+}
+
 /**
  * struct readahead_control - Describes a readahead request.
  *
-- 
cgit v1.2.3


From 0a467d0fdd9594fbb449ebc93852533332c528fd Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 14 Oct 2021 14:39:59 -0600
Subject: block: switch to atomic_t for request references

refcount_t is not as expensive as it used to be, but it's still more
expensive than the io_uring method of using atomic_t and just checking
for potential over/underflow.

This borrows that same implementation, which in turn is based on the
mm implementation from Linus.

Reviewed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 1b87b7c8bbff..561beb5be7ec 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -139,7 +139,7 @@ struct request {
 	unsigned short ioprio;
 
 	enum mq_rq_state state;
-	refcount_t ref;
+	atomic_t ref;
 
 	unsigned long deadline;
 
-- 
cgit v1.2.3


From 704b914f15fb7daaf517e3acc4bed472b50ca19e Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 3 Dec 2021 21:15:32 +0800
Subject: blk-mq: move srcu from blk_mq_hw_ctx to request_queue

In case of BLK_MQ_F_BLOCKING, per-hctx srcu is used to protect dispatch
critical area. However, this srcu instance stays at the end of hctx, and
it often takes standalone cacheline, often cold.

Inside srcu_read_lock() and srcu_read_unlock(), WRITE is always done on
the indirect percpu variable which is allocated from heap instead of
being embedded, srcu->srcu_idx is read only in srcu_read_lock(). It
doesn't matter if srcu structure stays in hctx or request queue.

So switch to per-request-queue srcu for protecting dispatch, and this
way simplifies quiesce a lot, not mention quiesce is always done on the
request queue wide.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20211203131534.3668411-3-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 8 --------
 include/linux/blkdev.h | 9 +++++++++
 2 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 561beb5be7ec..ecdc049b52fa 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -4,7 +4,6 @@
 
 #include <linux/blkdev.h>
 #include <linux/sbitmap.h>
-#include <linux/srcu.h>
 #include <linux/lockdep.h>
 #include <linux/scatterlist.h>
 #include <linux/prefetch.h>
@@ -375,13 +374,6 @@ struct blk_mq_hw_ctx {
 	 * q->unused_hctx_list.
 	 */
 	struct list_head	hctx_list;
-
-	/**
-	 * @srcu: Sleepable RCU. Use as lock when type of the hardware queue is
-	 * blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also
-	 * blk_mq_hw_ctx_size().
-	 */
-	struct srcu_struct	srcu[];
 };
 
 /**
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0a4416ef4fbf..c80cfaefc0a8 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -16,6 +16,7 @@
 #include <linux/percpu-refcount.h>
 #include <linux/blkzoned.h>
 #include <linux/sbitmap.h>
+#include <linux/srcu.h>
 
 struct module;
 struct request_queue;
@@ -373,11 +374,18 @@ struct request_queue {
 	 * devices that do not have multiple independent access ranges.
 	 */
 	struct blk_independent_access_ranges *ia_ranges;
+
+	/**
+	 * @srcu: Sleepable RCU. Use as lock when type of the request queue
+	 * is blocking (BLK_MQ_F_BLOCKING). Must be the last member
+	 */
+	struct srcu_struct	srcu[];
 };
 
 /* Keep blk_queue_flag_name[] in sync with the definitions below */
 #define QUEUE_FLAG_STOPPED	0	/* queue is stopped */
 #define QUEUE_FLAG_DYING	1	/* queue being torn down */
+#define QUEUE_FLAG_HAS_SRCU	2	/* SRCU is allocated */
 #define QUEUE_FLAG_NOMERGES     3	/* disable merge attempts */
 #define QUEUE_FLAG_SAME_COMP	4	/* complete on same CPU-group */
 #define QUEUE_FLAG_FAIL_IO	5	/* fake timeout */
@@ -415,6 +423,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
 
 #define blk_queue_stopped(q)	test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
 #define blk_queue_dying(q)	test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags)
+#define blk_queue_has_srcu(q)	test_bit(QUEUE_FLAG_HAS_SRCU, &(q)->queue_flags)
 #define blk_queue_dead(q)	test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags)
 #define blk_queue_init_done(q)	test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags)
 #define blk_queue_nomerges(q)	test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
-- 
cgit v1.2.3


From 0cc3a8017900f856f9bf4fdc41c2b5cb1670aabe Mon Sep 17 00:00:00 2001
From: Manish Chopra <manishc@marvell.com>
Date: Thu, 2 Dec 2021 13:01:56 -0800
Subject: qed*: enhance tx timeout debug info

This patch add some new qed APIs to query status block
info and report various data to MFW on tx timeout event

Along with that it enhances qede to dump more debug logs
(not just specific to the queue which was reported by stack)
on tx timeout which includes various other basic metadata about
all tx queues and other info (like status block etc.)

Signed-off-by: Manish Chopra <manishc@marvell.com>
Signed-off-by: Prabhakar Kushwaha <pkushwaha@marvell.com>
Signed-off-by: Alok Prasad <palok@marvell.com>
Signed-off-by: Ariel Elior <aelior@marvell.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/qed/qed_if.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index 0dae7fcc5ef2..9f4bfa2a4829 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -807,6 +807,12 @@ struct qed_devlink {
 	struct devlink_health_reporter *fw_reporter;
 };
 
+struct qed_sb_info_dbg {
+	u32 igu_prod;
+	u32 igu_cons;
+	u16 pi[PIS_PER_SB];
+};
+
 struct qed_common_cb_ops {
 	void (*arfs_filter_op)(void *dev, void *fltr, u8 fw_rc);
 	void (*link_update)(void *dev, struct qed_link_output *link);
@@ -1194,6 +1200,11 @@ struct qed_common_ops {
 	struct devlink* (*devlink_register)(struct qed_dev *cdev);
 
 	void (*devlink_unregister)(struct devlink *devlink);
+
+	__printf(2, 3) void (*mfw_report)(struct qed_dev *cdev, char *fmt, ...);
+
+	int (*get_sb_info)(struct qed_dev *cdev, struct qed_sb_info *sb,
+			   u16 qid, struct qed_sb_info_dbg *sb_dbg);
 };
 
 #define MASK_FIELD(_name, _value) \
-- 
cgit v1.2.3


From 823163ba6e52e644be5df4539a19e3df8d0988dd Mon Sep 17 00:00:00 2001
From: Manish Chopra <manishc@marvell.com>
Date: Thu, 2 Dec 2021 13:01:57 -0800
Subject: qed*: esl priv flag support through ethtool

ESL(Enhanced System Lockdown) was designed to lock PCI adapter firmware
images and prevent changes to critical non-volatile configuration data
so that uncontrolled, malicious or unintentional modification to the
adapters are avoided, ensuring it's operational state. Once this feature is
enabled, the device is locked, rejecting any modification to non-volatile
images. Once unlocked, the protection is off such that firmware and
non-volatile configurations may be altered.

Driver just reflects the capability and status of this through
the ethtool private flag.

Signed-off-by: Manish Chopra <manishc@marvell.com>
Signed-off-by: Prabhakar Kushwaha <pkushwaha@marvell.com>
Signed-off-by: Alok Prasad <palok@marvell.com>
Signed-off-by: Ariel Elior <aelior@marvell.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/qed/qed_if.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index 9f4bfa2a4829..6dc4943d8aec 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -652,6 +652,7 @@ struct qed_dev_info {
 
 	bool wol_support;
 	bool smart_an;
+	bool esl;
 
 	/* MBI version */
 	u32 mbi_version;
@@ -1205,6 +1206,8 @@ struct qed_common_ops {
 
 	int (*get_sb_info)(struct qed_dev *cdev, struct qed_sb_info *sb,
 			   u16 qid, struct qed_sb_info_dbg *sb_dbg);
+
+	int (*get_esl_status)(struct qed_dev *cdev, bool *esl_active);
 };
 
 #define MASK_FIELD(_name, _value) \
-- 
cgit v1.2.3


From a3642021923b26d86bb27d88c826494827612c06 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 29 Nov 2021 18:46:47 +0100
Subject: locking/rtmutex: Add rt_mutex_lock_nest_lock() and
 rt_mutex_lock_killable().

The locking selftest for ww-mutex expects to operate directly on the
base-mutex which becomes a rtmutex on PREEMPT_RT.

Add a rtmutex based implementation of mutex_lock_nest_lock() and
mutex_lock_killable() named rt_mutex_lock_nest_lock() abd
rt_mutex_lock_killable().

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20211129174654.668506-5-bigeasy@linutronix.de
---
 include/linux/rtmutex.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
index 9deedfeec2b1..7d049883a08a 100644
--- a/include/linux/rtmutex.h
+++ b/include/linux/rtmutex.h
@@ -99,13 +99,22 @@ extern void __rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 extern void rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass);
+extern void _rt_mutex_lock_nest_lock(struct rt_mutex *lock, struct lockdep_map *nest_lock);
 #define rt_mutex_lock(lock) rt_mutex_lock_nested(lock, 0)
+#define rt_mutex_lock_nest_lock(lock, nest_lock)			\
+	do {								\
+		typecheck(struct lockdep_map *, &(nest_lock)->dep_map);	\
+		_rt_mutex_lock_nest_lock(lock, &(nest_lock)->dep_map);	\
+	} while (0)
+
 #else
 extern void rt_mutex_lock(struct rt_mutex *lock);
 #define rt_mutex_lock_nested(lock, subclass) rt_mutex_lock(lock)
+#define rt_mutex_lock_nest_lock(lock, nest_lock) rt_mutex_lock(lock)
 #endif
 
 extern int rt_mutex_lock_interruptible(struct rt_mutex *lock);
+extern int rt_mutex_lock_killable(struct rt_mutex *lock);
 extern int rt_mutex_trylock(struct rt_mutex *lock);
 
 extern void rt_mutex_unlock(struct rt_mutex *lock);
-- 
cgit v1.2.3


From 0c1d7a2c2d32fac7ff4a644724b2d52a64184645 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 29 Nov 2021 18:46:48 +0100
Subject: lockdep: Remove softirq accounting on PREEMPT_RT.

There is not really a softirq context on PREEMPT_RT.  Softirqs on
PREEMPT_RT are always invoked within the context of a threaded
interrupt handler or within ksoftirqd. The "in-softirq" context is
preemptible and is protected by a per-CPU lock to ensure mutual
exclusion.

There is no difference on PREEMPT_RT between spin_lock_irq() and
spin_lock() because the former does not disable interrupts. Therefore
if a lock is used in_softirq() and locked once with spin_lock_irq()
then lockdep will report this with "inconsistent {SOFTIRQ-ON-W} ->
{IN-SOFTIRQ-W} usage".

Teach lockdep that we don't really do softirqs on -RT.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20211129174654.668506-6-bigeasy@linutronix.de
---
 include/linux/irqflags.h | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index 600c10da321a..4b140938b03e 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -71,14 +71,6 @@ do {						\
 do {						\
 	__this_cpu_dec(hardirq_context);	\
 } while (0)
-# define lockdep_softirq_enter()		\
-do {						\
-	current->softirq_context++;		\
-} while (0)
-# define lockdep_softirq_exit()			\
-do {						\
-	current->softirq_context--;		\
-} while (0)
 
 # define lockdep_hrtimer_enter(__hrtimer)		\
 ({							\
@@ -140,6 +132,21 @@ do {						\
 # define lockdep_irq_work_exit(__work)		do { } while (0)
 #endif
 
+#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT)
+# define lockdep_softirq_enter()		\
+do {						\
+	current->softirq_context++;		\
+} while (0)
+# define lockdep_softirq_exit()			\
+do {						\
+	current->softirq_context--;		\
+} while (0)
+
+#else
+# define lockdep_softirq_enter()		do { } while (0)
+# define lockdep_softirq_exit()			do { } while (0)
+#endif
+
 #if defined(CONFIG_IRQSOFF_TRACER) || \
 	defined(CONFIG_PREEMPT_TRACER)
  extern void stop_critical_timings(void);
-- 
cgit v1.2.3


From c0bed69daf4b67809b58cc7cd81a8fa4f45bc161 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Fri, 3 Dec 2021 15:59:34 +0800
Subject: locking: Make owner_on_cpu() into <linux/sched.h>

Move the owner_on_cpu() from kernel/locking/rwsem.c into
include/linux/sched.h with under CONFIG_SMP, then use it
in the mutex/rwsem/rtmutex to simplify the code.

Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20211203075935.136808-2-wangkefeng.wang@huawei.com
---
 include/linux/sched.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 78c351e35fec..ff609d9c2f21 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2171,6 +2171,15 @@ extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
 #endif
 
 #ifdef CONFIG_SMP
+static inline bool owner_on_cpu(struct task_struct *owner)
+{
+	/*
+	 * As lock holder preemption issue, we both skip spinning if
+	 * task is not on cpu or its cpu is preempted
+	 */
+	return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
+}
+
 /* Returns effective CPU energy utilization, as seen by the scheduler */
 unsigned long sched_cpu_util(int cpu, unsigned long max);
 #endif /* CONFIG_SMP */
-- 
cgit v1.2.3


From 4cf75fd4a2545ca4deea992f929602c9fdbe8058 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Fri, 3 Dec 2021 15:59:35 +0800
Subject: locking: Mark racy reads of owner->on_cpu

One of the more frequent data races reported by KCSAN is the racy read
in mutex_spin_on_owner(), which is usually reported as "race of unknown
origin" without showing the writer. This is due to the racing write
occurring in kernel/sched. Locally enabling KCSAN in kernel/sched shows:

 | write (marked) to 0xffff97f205079934 of 4 bytes by task 316 on cpu 6:
 |  finish_task                kernel/sched/core.c:4632 [inline]
 |  finish_task_switch         kernel/sched/core.c:4848
 |  context_switch             kernel/sched/core.c:4975 [inline]
 |  __schedule                 kernel/sched/core.c:6253
 |  schedule                   kernel/sched/core.c:6326
 |  schedule_preempt_disabled  kernel/sched/core.c:6385
 |  __mutex_lock_common        kernel/locking/mutex.c:680
 |  __mutex_lock               kernel/locking/mutex.c:740 [inline]
 |  __mutex_lock_slowpath      kernel/locking/mutex.c:1028
 |  mutex_lock                 kernel/locking/mutex.c:283
 |  tty_open_by_driver         drivers/tty/tty_io.c:2062 [inline]
 |  ...
 |
 | read to 0xffff97f205079934 of 4 bytes by task 322 on cpu 3:
 |  mutex_spin_on_owner        kernel/locking/mutex.c:370
 |  mutex_optimistic_spin      kernel/locking/mutex.c:480
 |  __mutex_lock_common        kernel/locking/mutex.c:610
 |  __mutex_lock               kernel/locking/mutex.c:740 [inline]
 |  __mutex_lock_slowpath      kernel/locking/mutex.c:1028
 |  mutex_lock                 kernel/locking/mutex.c:283
 |  tty_open_by_driver         drivers/tty/tty_io.c:2062 [inline]
 |  ...
 |
 | value changed: 0x00000001 -> 0x00000000

This race is clearly intentional, and the potential for miscompilation
is slim due to surrounding barrier() and cpu_relax(), and the value
being used as a boolean.

Nevertheless, marking this reader would more clearly denote intent and
make it obvious that concurrency is expected. Use READ_ONCE() to avoid
having to reason about compiler optimizations now and in future.

With previous refactor, mark the read to owner->on_cpu in owner_on_cpu(),
which immediately precedes the loop executing mutex_spin_on_owner().

Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20211203075935.136808-3-wangkefeng.wang@huawei.com
---
 include/linux/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index ff609d9c2f21..0b9b0e3f4791 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2177,7 +2177,7 @@ static inline bool owner_on_cpu(struct task_struct *owner)
 	 * As lock holder preemption issue, we both skip spinning if
 	 * task is not on cpu or its cpu is preempted
 	 */
-	return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
+	return READ_ONCE(owner->on_cpu) && !vcpu_is_preempted(task_cpu(owner));
 }
 
 /* Returns effective CPU energy utilization, as seen by the scheduler */
-- 
cgit v1.2.3


From fb08a1908cb119a4585611d91461ab6d27756b14 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 29 Nov 2021 11:21:38 +0100
Subject: dax: simplify the dax_device <-> gendisk association

Replace the dax_host_hash with an xarray indexed by the pointer value
of the gendisk, and require explicitly calls from the block drivers that
want to associate their gendisk with a dax_device.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Mike Snitzer <snitzer@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Link: https://lore.kernel.org/r/20211129102203.2243509-5-hch@lst.de
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/dax.h | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dax.h b/include/linux/dax.h
index 8623caa67388..e2e9a67004cb 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -11,9 +11,11 @@
 
 typedef unsigned long dax_entry_t;
 
+struct dax_device;
+struct gendisk;
 struct iomap_ops;
 struct iomap;
-struct dax_device;
+
 struct dax_operations {
 	/*
 	 * direct_access: translate a device-relative
@@ -39,8 +41,8 @@ struct dax_operations {
 };
 
 #if IS_ENABLED(CONFIG_DAX)
-struct dax_device *alloc_dax(void *private, const char *host,
-		const struct dax_operations *ops, unsigned long flags);
+struct dax_device *alloc_dax(void *private, const struct dax_operations *ops,
+		unsigned long flags);
 void put_dax(struct dax_device *dax_dev);
 void kill_dax(struct dax_device *dax_dev);
 void dax_write_cache(struct dax_device *dax_dev, bool wc);
@@ -68,7 +70,7 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma,
 	return dax_synchronous(dax_dev);
 }
 #else
-static inline struct dax_device *alloc_dax(void *private, const char *host,
+static inline struct dax_device *alloc_dax(void *private,
 		const struct dax_operations *ops, unsigned long flags)
 {
 	/*
@@ -107,6 +109,8 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma,
 struct writeback_control;
 int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff);
 #if IS_ENABLED(CONFIG_FS_DAX)
+int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk);
+void dax_remove_host(struct gendisk *disk);
 bool generic_fsdax_supported(struct dax_device *dax_dev,
 		struct block_device *bdev, int blocksize, sector_t start,
 		sector_t sectors);
@@ -128,6 +132,13 @@ struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t st
 dax_entry_t dax_lock_page(struct page *page);
 void dax_unlock_page(struct page *page, dax_entry_t cookie);
 #else
+static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk)
+{
+	return 0;
+}
+static inline void dax_remove_host(struct gendisk *disk)
+{
+}
 #define generic_fsdax_supported		NULL
 
 static inline bool dax_supported(struct dax_device *dax_dev,
-- 
cgit v1.2.3


From 7b0800d00dae8c897398abaf61e82db0d67d7afc Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 29 Nov 2021 11:21:42 +0100
Subject: dax: remove dax_capable

Just open code the block size and dax_dev == NULL checks in the callers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Mike Snitzer <snitzer@redhat.com>
Reviewed-by: Gao Xiang <hsiangkao@linux.alibaba.com> [erofs]
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Link: https://lore.kernel.org/r/20211129102203.2243509-9-hch@lst.de
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/dax.h | 14 --------------
 1 file changed, 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dax.h b/include/linux/dax.h
index e2e9a67004cb..439c3c70e347 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -111,12 +111,6 @@ int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff);
 #if IS_ENABLED(CONFIG_FS_DAX)
 int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk);
 void dax_remove_host(struct gendisk *disk);
-bool generic_fsdax_supported(struct dax_device *dax_dev,
-		struct block_device *bdev, int blocksize, sector_t start,
-		sector_t sectors);
-
-bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev,
-		int blocksize, sector_t start, sector_t len);
 
 static inline void fs_put_dax(struct dax_device *dax_dev)
 {
@@ -139,14 +133,6 @@ static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk)
 static inline void dax_remove_host(struct gendisk *disk)
 {
 }
-#define generic_fsdax_supported		NULL
-
-static inline bool dax_supported(struct dax_device *dax_dev,
-		struct block_device *bdev, int blocksize, sector_t start,
-		sector_t len)
-{
-	return false;
-}
 
 static inline void fs_put_dax(struct dax_device *dax_dev)
 {
-- 
cgit v1.2.3


From 60696eb26a37ab0199f7833ddbc1b75138c36d16 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 29 Nov 2021 11:21:48 +0100
Subject: fsdax: simplify the pgoff calculation

Replace the two steps of dax_iomap_sector and bdev_dax_pgoff with a
single dax_iomap_pgoff helper that avoids lots of cumbersome sector
conversions.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Link: https://lore.kernel.org/r/20211129102203.2243509-15-hch@lst.de
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/dax.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dax.h b/include/linux/dax.h
index 439c3c70e347..324363b798ec 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -107,7 +107,6 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma,
 #endif
 
 struct writeback_control;
-int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff);
 #if IS_ENABLED(CONFIG_FS_DAX)
 int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk);
 void dax_remove_host(struct gendisk *disk);
-- 
cgit v1.2.3


From c6f40468657d16e4010ef84bf32a761feb3469ea Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 29 Nov 2021 11:21:52 +0100
Subject: fsdax: decouple zeroing from the iomap buffered I/O code

Unshare the DAX and iomap buffered I/O page zeroing code.  This code
previously did a IS_DAX check deep inside the iomap code, which in
fact was the only DAX check in the code.  Instead move these checks
into the callers.  Most callers already have DAX special casing anyway
and XFS will need it for reflink support as well.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Link: https://lore.kernel.org/r/20211129102203.2243509-19-hch@lst.de
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/dax.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dax.h b/include/linux/dax.h
index 324363b798ec..b79036743e7f 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -14,6 +14,7 @@ typedef unsigned long dax_entry_t;
 struct dax_device;
 struct gendisk;
 struct iomap_ops;
+struct iomap_iter;
 struct iomap;
 
 struct dax_operations {
@@ -170,6 +171,11 @@ static inline void dax_unlock_page(struct page *page, dax_entry_t cookie)
 }
 #endif
 
+int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
+		const struct iomap_ops *ops);
+int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
+		const struct iomap_ops *ops);
+
 #if IS_ENABLED(CONFIG_DAX)
 int dax_read_lock(void);
 void dax_read_unlock(int id);
@@ -204,7 +210,6 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
 int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
 				      pgoff_t index);
-s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap);
 static inline bool dax_mapping(struct address_space *mapping)
 {
 	return mapping->host && IS_DAX(mapping->host);
-- 
cgit v1.2.3


From 952da06375c8f3aa58474fff718d9ae8442531b9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 29 Nov 2021 11:21:58 +0100
Subject: iomap: add a IOMAP_DAX flag

Add a flag so that the file system can easily detect DAX operations
based just on the iomap operation requested instead of looking at
inode state using IS_DAX.  This will be needed to apply the to be
added partition offset only for operations that actually use DAX,
but not things like fiemap that are based on the block device.
In the long run it should also allow turning the bdev, dax_dev
and inline_data into a union.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Link: https://lore.kernel.org/r/20211129102203.2243509-25-hch@lst.de
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/iomap.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 6d1b08d0ae93..5b9432f9f79e 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -141,6 +141,11 @@ struct iomap_page_ops {
 #define IOMAP_NOWAIT		(1 << 5) /* do not block */
 #define IOMAP_OVERWRITE_ONLY	(1 << 6) /* only pure overwrites allowed */
 #define IOMAP_UNSHARE		(1 << 7) /* unshare_file_range */
+#ifdef CONFIG_FS_DAX
+#define IOMAP_DAX		(1 << 8) /* DAX mapping */
+#else
+#define IOMAP_DAX		0
+#endif /* CONFIG_FS_DAX */
 
 struct iomap_ops {
 	/*
-- 
cgit v1.2.3


From cd913c76f489def1a388e3a5b10df94948ede3f5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 29 Nov 2021 11:21:59 +0100
Subject: dax: return the partition offset from fs_dax_get_by_bdev

Prepare for the removal of the block_device from the DAX I/O path by
returning the partition offset from fs_dax_get_by_bdev so that the file
systems have it at hand for use during I/O.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Link: https://lore.kernel.org/r/20211129102203.2243509-26-hch@lst.de
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/dax.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dax.h b/include/linux/dax.h
index b79036743e7f..f6f353382cc9 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -117,7 +117,8 @@ static inline void fs_put_dax(struct dax_device *dax_dev)
 	put_dax(dax_dev);
 }
 
-struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev);
+struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev,
+		u64 *start_off);
 int dax_writeback_mapping_range(struct address_space *mapping,
 		struct dax_device *dax_dev, struct writeback_control *wbc);
 
@@ -138,7 +139,8 @@ static inline void fs_put_dax(struct dax_device *dax_dev)
 {
 }
 
-static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
+static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev,
+		u64 *start_off)
 {
 	return NULL;
 }
-- 
cgit v1.2.3


From 2ede892342b3c628991ff1b9060108a7edd92d94 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 29 Nov 2021 11:22:01 +0100
Subject: dax: fix up some of the block device related ifdefs

The DAX device <-> block device association is only enabled if
CONFIG_BLOCK is enabled.  Update dax.h to account for that and use
the right conditions for the fs_put_dax stub as well.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Link: https://lore.kernel.org/r/20211129102203.2243509-28-hch@lst.de
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/dax.h | 33 ++++++++++++++++-----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dax.h b/include/linux/dax.h
index f6f353382cc9..87ae4c9b1d65 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -108,24 +108,15 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma,
 #endif
 
 struct writeback_control;
-#if IS_ENABLED(CONFIG_FS_DAX)
+#if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX)
 int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk);
 void dax_remove_host(struct gendisk *disk);
-
+struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev,
+		u64 *start_off);
 static inline void fs_put_dax(struct dax_device *dax_dev)
 {
 	put_dax(dax_dev);
 }
-
-struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev,
-		u64 *start_off);
-int dax_writeback_mapping_range(struct address_space *mapping,
-		struct dax_device *dax_dev, struct writeback_control *wbc);
-
-struct page *dax_layout_busy_page(struct address_space *mapping);
-struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end);
-dax_entry_t dax_lock_page(struct page *page);
-void dax_unlock_page(struct page *page, dax_entry_t cookie);
 #else
 static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk)
 {
@@ -134,17 +125,25 @@ static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk)
 static inline void dax_remove_host(struct gendisk *disk)
 {
 }
-
-static inline void fs_put_dax(struct dax_device *dax_dev)
-{
-}
-
 static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev,
 		u64 *start_off)
 {
 	return NULL;
 }
+static inline void fs_put_dax(struct dax_device *dax_dev)
+{
+}
+#endif /* CONFIG_BLOCK && CONFIG_FS_DAX */
 
+#if IS_ENABLED(CONFIG_FS_DAX)
+int dax_writeback_mapping_range(struct address_space *mapping,
+		struct dax_device *dax_dev, struct writeback_control *wbc);
+
+struct page *dax_layout_busy_page(struct address_space *mapping);
+struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end);
+dax_entry_t dax_lock_page(struct page *page);
+void dax_unlock_page(struct page *page, dax_entry_t cookie);
+#else
 static inline struct page *dax_layout_busy_page(struct address_space *mapping)
 {
 	return NULL;
-- 
cgit v1.2.3


From 866de407444398bc8140ea70de1dba5f91cc34ac Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Fri, 3 Dec 2021 13:30:01 +0800
Subject: bpf: Disallow BPF_LOG_KERNEL log level for bpf(BPF_BTF_LOAD)

BPF_LOG_KERNEL is only used internally, so disallow bpf_btf_load()
to set log level as BPF_LOG_KERNEL. The same checking has already
been done in bpf_check(), so factor out a helper to check the
validity of log attributes and use it in both places.

Fixes: 8580ac9404f6 ("bpf: Process in-kernel BTF")
Signed-off-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20211203053001.740945-1-houtao1@huawei.com
---
 include/linux/bpf_verifier.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index c8a78e830fca..182b16a91084 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -396,6 +396,13 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
 		 log->level == BPF_LOG_KERNEL);
 }
 
+static inline bool
+bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log)
+{
+	return log->len_total >= 128 && log->len_total <= UINT_MAX >> 2 &&
+	       log->level && log->ubuf && !(log->level & ~BPF_LOG_MASK);
+}
+
 #define BPF_MAX_SUBPROGS 256
 
 struct bpf_subprog_info {
-- 
cgit v1.2.3


From b80892ca022e9eb484771a66eb68e12364695a2a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 28 Oct 2021 17:10:17 +0200
Subject: memremap: remove support for external pgmap refcounts

No driver is left using the external pgmap refcount, so remove the
code to support it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://lore.kernel.org/r/20211028151017.50234-1-hch@lst.de
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/memremap.h | 18 +++---------------
 1 file changed, 3 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index c0e9d35889e8..a8bc588fe7aa 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -72,16 +72,6 @@ struct dev_pagemap_ops {
 	 */
 	void (*page_free)(struct page *page);
 
-	/*
-	 * Transition the refcount in struct dev_pagemap to the dead state.
-	 */
-	void (*kill)(struct dev_pagemap *pgmap);
-
-	/*
-	 * Wait for refcount in struct dev_pagemap to be idle and reap it.
-	 */
-	void (*cleanup)(struct dev_pagemap *pgmap);
-
 	/*
 	 * Used for private (un-addressable) device memory only.  Must migrate
 	 * the page back to a CPU accessible page.
@@ -95,8 +85,7 @@ struct dev_pagemap_ops {
  * struct dev_pagemap - metadata for ZONE_DEVICE mappings
  * @altmap: pre-allocated/reserved memory for vmemmap allocations
  * @ref: reference count that pins the devm_memremap_pages() mapping
- * @internal_ref: internal reference if @ref is not provided by the caller
- * @done: completion for @internal_ref
+ * @done: completion for @ref
  * @type: memory type: see MEMORY_* in memory_hotplug.h
  * @flags: PGMAP_* flags to specify defailed behavior
  * @ops: method table
@@ -109,8 +98,7 @@ struct dev_pagemap_ops {
  */
 struct dev_pagemap {
 	struct vmem_altmap altmap;
-	struct percpu_ref *ref;
-	struct percpu_ref internal_ref;
+	struct percpu_ref ref;
 	struct completion done;
 	enum memory_type type;
 	unsigned int flags;
@@ -191,7 +179,7 @@ static inline unsigned long memremap_compat_align(void)
 static inline void put_dev_pagemap(struct dev_pagemap *pgmap)
 {
 	if (pgmap)
-		percpu_ref_put(pgmap->ref);
+		percpu_ref_put(&pgmap->ref);
 }
 
 #endif /* _LINUX_MEMREMAP_H_ */
-- 
cgit v1.2.3


From 02e4079913500f24ceb082d8d87d8665f044b298 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Fri, 3 Dec 2021 12:17:04 +0100
Subject: fs: remove unused low-level mapping helpers

Now that we ported all places to use the new low-level mapping helpers
that are able to support filesystems mounted with an idmapping we can
remove the old low-level mapping helpers. With the removal of these old
helpers we also conclude the renaming of the mapping helpers we started
in commit a65e58e791a1 ("fs: document and rename fsid helpers").

Link: https://lore.kernel.org/r/20211123114227.3124056-8-brauner@kernel.org (v1)
Link: https://lore.kernel.org/r/20211130121032.3753852-8-brauner@kernel.org (v2)
Link: https://lore.kernel.org/r/20211203111707.3901969-8-brauner@kernel.org
Cc: Seth Forshee <sforshee@digitalocean.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
CC: linux-fsdevel@vger.kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Seth Forshee <sforshee@digitalocean.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 include/linux/mnt_idmapping.h | 56 -------------------------------------------
 1 file changed, 56 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h
index 60341cd33ccc..0c6ab3f4c952 100644
--- a/include/linux/mnt_idmapping.h
+++ b/include/linux/mnt_idmapping.h
@@ -13,62 +13,6 @@ struct user_namespace;
  */
 extern struct user_namespace init_user_ns;
 
-/**
- * kuid_into_mnt - map a kuid down into a mnt_userns
- * @mnt_userns: user namespace of the relevant mount
- * @kuid: kuid to be mapped
- *
- * Return: @kuid mapped according to @mnt_userns.
- * If @kuid has no mapping INVALID_UID is returned.
- */
-static inline kuid_t kuid_into_mnt(struct user_namespace *mnt_userns,
-				   kuid_t kuid)
-{
-	return make_kuid(mnt_userns, __kuid_val(kuid));
-}
-
-/**
- * kgid_into_mnt - map a kgid down into a mnt_userns
- * @mnt_userns: user namespace of the relevant mount
- * @kgid: kgid to be mapped
- *
- * Return: @kgid mapped according to @mnt_userns.
- * If @kgid has no mapping INVALID_GID is returned.
- */
-static inline kgid_t kgid_into_mnt(struct user_namespace *mnt_userns,
-				   kgid_t kgid)
-{
-	return make_kgid(mnt_userns, __kgid_val(kgid));
-}
-
-/**
- * kuid_from_mnt - map a kuid up into a mnt_userns
- * @mnt_userns: user namespace of the relevant mount
- * @kuid: kuid to be mapped
- *
- * Return: @kuid mapped up according to @mnt_userns.
- * If @kuid has no mapping INVALID_UID is returned.
- */
-static inline kuid_t kuid_from_mnt(struct user_namespace *mnt_userns,
-				   kuid_t kuid)
-{
-	return KUIDT_INIT(from_kuid(mnt_userns, kuid));
-}
-
-/**
- * kgid_from_mnt - map a kgid up into a mnt_userns
- * @mnt_userns: user namespace of the relevant mount
- * @kgid: kgid to be mapped
- *
- * Return: @kgid mapped up according to @mnt_userns.
- * If @kgid has no mapping INVALID_GID is returned.
- */
-static inline kgid_t kgid_from_mnt(struct user_namespace *mnt_userns,
-				   kgid_t kgid)
-{
-	return KGIDT_INIT(from_kgid(mnt_userns, kgid));
-}
-
 /**
  * initial_idmapping - check whether this is the initial mapping
  * @ns: idmapping to check
-- 
cgit v1.2.3


From 209188ce75d0d357c292f6bb81d712acdd4e7db7 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Fri, 3 Dec 2021 12:17:05 +0100
Subject: fs: port higher-level mapping helpers

Enable the mapped_fs{g,u}id() helpers to support filesystems mounted
with an idmapping. Apart from core mapping helpers that use
mapped_fs{g,u}id() to initialize struct inode's i_{g,u}id fields xfs is
the only place that uses these low-level helpers directly.

The patch only extends the helpers to be able to take the filesystem
idmapping into account. Since we don't actually yet pass the
filesystem's idmapping in no functional changes happen. This will happen
in a final patch.

Link: https://lore.kernel.org/r/20211123114227.3124056-9-brauner@kernel.org (v1)
Link: https://lore.kernel.org/r/20211130121032.3753852-9-brauner@kernel.org (v2)
Link: https://lore.kernel.org/r/20211203111707.3901969-9-brauner@kernel.org
Cc: Seth Forshee <sforshee@digitalocean.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
CC: linux-fsdevel@vger.kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Seth Forshee <sforshee@digitalocean.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 include/linux/fs.h            |  8 ++++----
 include/linux/mnt_idmapping.h | 12 ++++++++----
 2 files changed, 12 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 57aee6ebba72..e1f28f757f1b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1664,7 +1664,7 @@ static inline kgid_t i_gid_into_mnt(struct user_namespace *mnt_userns,
 static inline void inode_fsuid_set(struct inode *inode,
 				   struct user_namespace *mnt_userns)
 {
-	inode->i_uid = mapped_fsuid(mnt_userns);
+	inode->i_uid = mapped_fsuid(mnt_userns, &init_user_ns);
 }
 
 /**
@@ -1678,7 +1678,7 @@ static inline void inode_fsuid_set(struct inode *inode,
 static inline void inode_fsgid_set(struct inode *inode,
 				   struct user_namespace *mnt_userns)
 {
-	inode->i_gid = mapped_fsgid(mnt_userns);
+	inode->i_gid = mapped_fsgid(mnt_userns, &init_user_ns);
 }
 
 /**
@@ -1699,10 +1699,10 @@ static inline bool fsuidgid_has_mapping(struct super_block *sb,
 	kuid_t kuid;
 	kgid_t kgid;
 
-	kuid = mapped_fsuid(mnt_userns);
+	kuid = mapped_fsuid(mnt_userns, &init_user_ns);
 	if (!uid_valid(kuid))
 		return false;
-	kgid = mapped_fsgid(mnt_userns);
+	kgid = mapped_fsgid(mnt_userns, &init_user_ns);
 	if (!gid_valid(kgid))
 		return false;
 	return kuid_has_mapping(fs_userns, kuid) &&
diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h
index 0c6ab3f4c952..ee5a217de2a8 100644
--- a/include/linux/mnt_idmapping.h
+++ b/include/linux/mnt_idmapping.h
@@ -196,6 +196,7 @@ static inline kgid_t mapped_kgid_user(struct user_namespace *mnt_userns,
 /**
  * mapped_fsuid - return caller's fsuid mapped up into a mnt_userns
  * @mnt_userns: the mount's idmapping
+ * @fs_userns: the filesystem's idmapping
  *
  * Use this helper to initialize a new vfs or filesystem object based on
  * the caller's fsuid. A common example is initializing the i_uid field of
@@ -205,14 +206,16 @@ static inline kgid_t mapped_kgid_user(struct user_namespace *mnt_userns,
  *
  * Return: the caller's current fsuid mapped up according to @mnt_userns.
  */
-static inline kuid_t mapped_fsuid(struct user_namespace *mnt_userns)
+static inline kuid_t mapped_fsuid(struct user_namespace *mnt_userns,
+				  struct user_namespace *fs_userns)
 {
-	return mapped_kuid_user(mnt_userns, &init_user_ns, current_fsuid());
+	return mapped_kuid_user(mnt_userns, fs_userns, current_fsuid());
 }
 
 /**
  * mapped_fsgid - return caller's fsgid mapped up into a mnt_userns
  * @mnt_userns: the mount's idmapping
+ * @fs_userns: the filesystem's idmapping
  *
  * Use this helper to initialize a new vfs or filesystem object based on
  * the caller's fsgid. A common example is initializing the i_gid field of
@@ -222,9 +225,10 @@ static inline kuid_t mapped_fsuid(struct user_namespace *mnt_userns)
  *
  * Return: the caller's current fsgid mapped up according to @mnt_userns.
  */
-static inline kgid_t mapped_fsgid(struct user_namespace *mnt_userns)
+static inline kgid_t mapped_fsgid(struct user_namespace *mnt_userns,
+				  struct user_namespace *fs_userns)
 {
-	return mapped_kgid_user(mnt_userns, &init_user_ns, current_fsgid());
+	return mapped_kgid_user(mnt_userns, fs_userns, current_fsgid());
 }
 
 #endif /* _LINUX_MNT_IDMAPPING_H */
-- 
cgit v1.2.3


From a1ec9040a2a9122605ac26e5725c6de019184419 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Fri, 3 Dec 2021 12:17:06 +0100
Subject: fs: add i_user_ns() helper

Since we'll be passing the filesystem's idmapping in even more places in
the following patches and we do already dereference struct inode to get
to the filesystem's idmapping multiple times add a tiny helper.

Link: https://lore.kernel.org/r/20211123114227.3124056-10-brauner@kernel.org (v1)
Link: https://lore.kernel.org/r/20211130121032.3753852-10-brauner@kernel.org (v2)
Link: https://lore.kernel.org/r/20211203111707.3901969-10-brauner@kernel.org
Cc: Seth Forshee <sforshee@digitalocean.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
CC: linux-fsdevel@vger.kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Seth Forshee <sforshee@digitalocean.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 include/linux/fs.h | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index e1f28f757f1b..3d6d514943ab 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1600,6 +1600,11 @@ struct super_block {
 	struct list_head	s_inodes_wb;	/* writeback inodes */
 } __randomize_layout;
 
+static inline struct user_namespace *i_user_ns(const struct inode *inode)
+{
+	return inode->i_sb->s_user_ns;
+}
+
 /* Helper functions so that in most cases filesystems will
  * not need to deal directly with kuid_t and kgid_t and can
  * instead deal with the raw numeric values that are stored
@@ -1607,22 +1612,22 @@ struct super_block {
  */
 static inline uid_t i_uid_read(const struct inode *inode)
 {
-	return from_kuid(inode->i_sb->s_user_ns, inode->i_uid);
+	return from_kuid(i_user_ns(inode), inode->i_uid);
 }
 
 static inline gid_t i_gid_read(const struct inode *inode)
 {
-	return from_kgid(inode->i_sb->s_user_ns, inode->i_gid);
+	return from_kgid(i_user_ns(inode), inode->i_gid);
 }
 
 static inline void i_uid_write(struct inode *inode, uid_t uid)
 {
-	inode->i_uid = make_kuid(inode->i_sb->s_user_ns, uid);
+	inode->i_uid = make_kuid(i_user_ns(inode), uid);
 }
 
 static inline void i_gid_write(struct inode *inode, gid_t gid)
 {
-	inode->i_gid = make_kgid(inode->i_sb->s_user_ns, gid);
+	inode->i_gid = make_kgid(i_user_ns(inode), gid);
 }
 
 /**
-- 
cgit v1.2.3


From bd303368b776eead1c29e6cdda82bde7128b82a7 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Fri, 3 Dec 2021 12:17:07 +0100
Subject: fs: support mapped mounts of mapped filesystems

In previous patches we added new and modified existing helpers to handle
idmapped mounts of filesystems mounted with an idmapping. In this final
patch we convert all relevant places in the vfs to actually pass the
filesystem's idmapping into these helpers.

With this the vfs is in shape to handle idmapped mounts of filesystems
mounted with an idmapping. Note that this is just the generic
infrastructure. Actually adding support for idmapped mounts to a
filesystem mountable with an idmapping is follow-up work.

In this patch we extend the definition of an idmapped mount from a mount
that that has the initial idmapping attached to it to a mount that has
an idmapping attached to it which is not the same as the idmapping the
filesystem was mounted with.

As before we do not allow the initial idmapping to be attached to a
mount. In addition this patch prevents that the idmapping the filesystem
was mounted with can be attached to a mount created based on this
filesystem.

This has multiple reasons and advantages. First, attaching the initial
idmapping or the filesystem's idmapping doesn't make much sense as in
both cases the values of the i_{g,u}id and other places where k{g,u}ids
are used do not change. Second, a user that really wants to do this for
whatever reason can just create a separate dedicated identical idmapping
to attach to the mount. Third, we can continue to use the initial
idmapping as an indicator that a mount is not idmapped allowing us to
continue to keep passing the initial idmapping into the mapping helpers
to tell them that something isn't an idmapped mount even if the
filesystem is mounted with an idmapping.

Link: https://lore.kernel.org/r/20211123114227.3124056-11-brauner@kernel.org (v1)
Link: https://lore.kernel.org/r/20211130121032.3753852-11-brauner@kernel.org (v2)
Link: https://lore.kernel.org/r/20211203111707.3901969-11-brauner@kernel.org
Cc: Seth Forshee <sforshee@digitalocean.com>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
CC: linux-fsdevel@vger.kernel.org
Reviewed-by: Seth Forshee <sforshee@digitalocean.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 include/linux/fs.h | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3d6d514943ab..493b87e3616b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1641,7 +1641,7 @@ static inline void i_gid_write(struct inode *inode, gid_t gid)
 static inline kuid_t i_uid_into_mnt(struct user_namespace *mnt_userns,
 				    const struct inode *inode)
 {
-	return mapped_kuid_fs(mnt_userns, &init_user_ns, inode->i_uid);
+	return mapped_kuid_fs(mnt_userns, i_user_ns(inode), inode->i_uid);
 }
 
 /**
@@ -1655,7 +1655,7 @@ static inline kuid_t i_uid_into_mnt(struct user_namespace *mnt_userns,
 static inline kgid_t i_gid_into_mnt(struct user_namespace *mnt_userns,
 				    const struct inode *inode)
 {
-	return mapped_kgid_fs(mnt_userns, &init_user_ns, inode->i_gid);
+	return mapped_kgid_fs(mnt_userns, i_user_ns(inode), inode->i_gid);
 }
 
 /**
@@ -1669,7 +1669,7 @@ static inline kgid_t i_gid_into_mnt(struct user_namespace *mnt_userns,
 static inline void inode_fsuid_set(struct inode *inode,
 				   struct user_namespace *mnt_userns)
 {
-	inode->i_uid = mapped_fsuid(mnt_userns, &init_user_ns);
+	inode->i_uid = mapped_fsuid(mnt_userns, i_user_ns(inode));
 }
 
 /**
@@ -1683,7 +1683,7 @@ static inline void inode_fsuid_set(struct inode *inode,
 static inline void inode_fsgid_set(struct inode *inode,
 				   struct user_namespace *mnt_userns)
 {
-	inode->i_gid = mapped_fsgid(mnt_userns, &init_user_ns);
+	inode->i_gid = mapped_fsgid(mnt_userns, i_user_ns(inode));
 }
 
 /**
@@ -1704,10 +1704,10 @@ static inline bool fsuidgid_has_mapping(struct super_block *sb,
 	kuid_t kuid;
 	kgid_t kgid;
 
-	kuid = mapped_fsuid(mnt_userns, &init_user_ns);
+	kuid = mapped_fsuid(mnt_userns, fs_userns);
 	if (!uid_valid(kuid))
 		return false;
-	kgid = mapped_fsgid(mnt_userns, &init_user_ns);
+	kgid = mapped_fsgid(mnt_userns, fs_userns);
 	if (!gid_valid(kgid))
 		return false;
 	return kuid_has_mapping(fs_userns, kuid) &&
@@ -2653,13 +2653,14 @@ static inline struct user_namespace *file_mnt_user_ns(struct file *file)
  * is_idmapped_mnt - check whether a mount is mapped
  * @mnt: the mount to check
  *
- * If @mnt has an idmapping attached to it @mnt is mapped.
+ * If @mnt has an idmapping attached different from the
+ * filesystem's idmapping then @mnt is mapped.
  *
  * Return: true if mount is mapped, false if not.
  */
 static inline bool is_idmapped_mnt(const struct vfsmount *mnt)
 {
-	return mnt_user_ns(mnt) != &init_user_ns;
+	return mnt_user_ns(mnt) != mnt->mnt_sb->s_user_ns;
 }
 
 extern long vfs_truncate(const struct path *, loff_t);
-- 
cgit v1.2.3


From 019cd8a9e3bcbbf6bac8036a9ae545f7858e0c08 Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Tue, 2 Nov 2021 16:02:01 -0600
Subject: ARM: ixp4xx: remove unused header file pata_ixp4xx_cf.h

Commit b00ced38e317 ("ARM: ixp4xx: Delete Avila boardfiles") removed the
last use of <linux/platform_data/pata_ixp4xx_cf.h> but left the header file
in place.  Nothing uses this file, delete it now.

Cc: Linus Walleij <linus.walleij@linaro.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/platform_data/pata_ixp4xx_cf.h | 21 ---------------------
 1 file changed, 21 deletions(-)
 delete mode 100644 include/linux/platform_data/pata_ixp4xx_cf.h

(limited to 'include/linux')

diff --git a/include/linux/platform_data/pata_ixp4xx_cf.h b/include/linux/platform_data/pata_ixp4xx_cf.h
deleted file mode 100644
index e60fa41da4a5..000000000000
--- a/include/linux/platform_data/pata_ixp4xx_cf.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __PLATFORM_DATA_PATA_IXP4XX_H
-#define __PLATFORM_DATA_PATA_IXP4XX_H
-
-#include <linux/types.h>
-
-/*
- * This structure provide a means for the board setup code
- * to give information to th pata_ixp4xx driver. It is
- * passed as platform_data.
- */
-struct ixp4xx_pata_data {
-	volatile u32	*cs0_cfg;
-	volatile u32	*cs1_cfg;
-	unsigned long	cs0_bits;
-	unsigned long	cs1_bits;
-	void __iomem	*cmd;
-	void __iomem	*ctl;
-};
-
-#endif
-- 
cgit v1.2.3


From 52f982f00b220d097a71a23c149a1d18efc08e63 Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Mon, 6 Dec 2021 14:24:06 +0100
Subject: security,selinux: remove security_add_mnt_opt()

Its last user has been removed in commit f2aedb713c28 ("NFS: Add
fs_context support.").

Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
Reviewed-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h | 2 --
 include/linux/lsm_hooks.h     | 2 --
 include/linux/security.h      | 8 --------
 3 files changed, 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index ae2228f0711d..a5a724c308d8 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -78,8 +78,6 @@ LSM_HOOK(int, 0, sb_set_mnt_opts, struct super_block *sb, void *mnt_opts,
 LSM_HOOK(int, 0, sb_clone_mnt_opts, const struct super_block *oldsb,
 	 struct super_block *newsb, unsigned long kern_flags,
 	 unsigned long *set_kern_flags)
-LSM_HOOK(int, 0, sb_add_mnt_opt, const char *option, const char *val,
-	 int len, void **mnt_opts)
 LSM_HOOK(int, 0, move_mount, const struct path *from_path,
 	 const struct path *to_path)
 LSM_HOOK(int, 0, dentry_init_security, struct dentry *dentry,
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 52c1990644b9..3bf5c658bc44 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -180,8 +180,6 @@
  *	Copy all security options from a given superblock to another
  *	@oldsb old superblock which contain information to clone
  *	@newsb new superblock which needs filled in
- * @sb_add_mnt_opt:
- * 	Add one mount @option to @mnt_opts.
  * @sb_parse_opts_str:
  *	Parse a string of security data filling in the opts structure
  *	@options string containing all mount options known by the LSM
diff --git a/include/linux/security.h b/include/linux/security.h
index bb301963e333..6d72772182c8 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -313,8 +313,6 @@ int security_sb_clone_mnt_opts(const struct super_block *oldsb,
 				struct super_block *newsb,
 				unsigned long kern_flags,
 				unsigned long *set_kern_flags);
-int security_add_mnt_opt(const char *option, const char *val,
-				int len, void **mnt_opts);
 int security_move_mount(const struct path *from_path, const struct path *to_path);
 int security_dentry_init_security(struct dentry *dentry, int mode,
 				  const struct qstr *name,
@@ -711,12 +709,6 @@ static inline int security_sb_clone_mnt_opts(const struct super_block *oldsb,
 	return 0;
 }
 
-static inline int security_add_mnt_opt(const char *option, const char *val,
-					int len, void **mnt_opts)
-{
-	return 0;
-}
-
 static inline int security_move_mount(const struct path *from_path,
 				      const struct path *to_path)
 {
-- 
cgit v1.2.3


From 8ab30a331946c34e4ba022c44df8624acea1c74e Mon Sep 17 00:00:00 2001
From: John Garry <john.garry@huawei.com>
Date: Mon, 6 Dec 2021 20:49:48 +0800
Subject: blk-mq: Drop busy_iter_fn blk_mq_hw_ctx argument

The only user of blk_mq_hw_ctx blk_mq_hw_ctx argument is
blk_mq_rq_inflight().

Function blk_mq_rq_inflight() uses the hctx to find the associated request
queue to match against the request. However this same check is already
done in caller bt_iter(), so drop this check.

With that change there are no more users of busy_iter_fn blk_mq_hw_ctx
argument, so drop the argument.

Reviewed-by Hannes Reinecke <hare@suse.de>

Signed-off-by: John Garry <john.garry@huawei.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Tested-by: Kashyap Desai <kashyap.desai@broadcom.com>
Link: https://lore.kernel.org/r/1638794990-137490-2-git-send-email-john.garry@huawei.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index ecdc049b52fa..17ebf29e42d8 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -470,8 +470,7 @@ struct blk_mq_queue_data {
 	bool last;
 };
 
-typedef bool (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
-		bool);
+typedef bool (busy_iter_fn)(struct request *, void *, bool);
 typedef bool (busy_tag_iter_fn)(struct request *, void *, bool);
 
 /**
-- 
cgit v1.2.3


From fc39f8d2d1c10ac04976b0a247865bb0cec4dd88 Mon Sep 17 00:00:00 2001
From: John Garry <john.garry@huawei.com>
Date: Mon, 6 Dec 2021 20:49:49 +0800
Subject: blk-mq: Delete busy_iter_fn

Typedefs busy_iter_fn and busy_tag_iter_fn are now identical, so delete
busy_iter_fn to reduce duplication.

It would be nicer to delete busy_tag_iter_fn, as the name busy_iter_fn is
less specific.

However busy_tag_iter_fn is used in many different parts of the tree,
unlike busy_iter_fn which is just use in block/, so just take the
straightforward path now, so that we could rename later treewide.

Signed-off-by: John Garry <john.garry@huawei.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Tested-by: Kashyap Desai <kashyap.desai@broadcom.com>
Link: https://lore.kernel.org/r/1638794990-137490-3-git-send-email-john.garry@huawei.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 17ebf29e42d8..772f8f921526 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -470,7 +470,6 @@ struct blk_mq_queue_data {
 	bool last;
 };
 
-typedef bool (busy_iter_fn)(struct request *, void *, bool);
 typedef bool (busy_tag_iter_fn)(struct request *, void *, bool);
 
 /**
-- 
cgit v1.2.3


From 05770dd0ad110854c7157d95700d7c89979cdb3e Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Mon, 22 Nov 2021 18:30:12 +0900
Subject: tracing: Support __rel_loc relative dynamic data location attribute

Add '__rel_loc' new dynamic data location attribute which encodes
the data location from the next to the field itself.

The '__data_loc' is used for encoding the dynamic data location on
the trace event record. But '__data_loc' is not useful if the writer
doesn't know the event header (e.g. user event), because it records
the dynamic data offset from the entry of the record, not the field
itself.

This new '__rel_loc' attribute encodes the data location relatively
from the next of the field. For example, when there is a record like
below (the number in the parentheses is the size of fields)

 |header(N)|common(M)|fields(K)|__data_loc(4)|fields(L)|data(G)|

In this case, '__data_loc' field will be

 __data_loc = (G << 16) | (N+M+K+4+L)

If '__rel_loc' is used, this will be

 |header(N)|common(M)|fields(K)|__rel_loc(4)|fields(L)|data(G)|

where

 __rel_loc = (G << 16) | (L)

This case shows L bytes after the '__rel_loc' attribute  field,
if there is no fields after the __rel_loc field, L must be 0.

This is relatively easy (and no need to consider the kernel header
change) when the event data fields are composed by user who doesn't
know header and common fields.

Link: https://lkml.kernel.org/r/163757341258.510314.4214431827833229956.stgit@devnote2

Cc: Beau Belgrave <beaub@linux.microsoft.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Tom Zanussi <zanussi@kernel.org>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/trace_events.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 2d167ac3452c..3900404aa063 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -782,6 +782,7 @@ enum {
 	FILTER_OTHER = 0,
 	FILTER_STATIC_STRING,
 	FILTER_DYN_STRING,
+	FILTER_RDYN_STRING,
 	FILTER_PTR_STRING,
 	FILTER_TRACE_FN,
 	FILTER_COMM,
-- 
cgit v1.2.3


From 8c33915d77a565b8b5d44e6368e22b6ea300b7a8 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Sun, 28 Nov 2021 20:00:29 +0100
Subject: platform/x86: wmi: Add no_notify_data flag to struct wmi_driver

Some WMI implementations do notifies on WMI objects without a _WED method
allow WMI drivers to indicate that _WED should not be called for notifies
on the WMI objects the driver is bound to.

Instead the driver's notify callback will simply be called with a NULL
data argument.

Reported-by: Yauhen Kharuzhy <jekhor@gmail.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Link: https://lore.kernel.org/r/20211128190031.405620-3-hdegoede@redhat.com
---
 include/linux/wmi.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/wmi.h b/include/linux/wmi.h
index 2cb3913c1f50..b88d7b58e61e 100644
--- a/include/linux/wmi.h
+++ b/include/linux/wmi.h
@@ -35,6 +35,7 @@ extern int set_required_buffer_size(struct wmi_device *wdev, u64 length);
 struct wmi_driver {
 	struct device_driver driver;
 	const struct wmi_device_id *id_table;
+	bool no_notify_data;
 
 	int (*probe)(struct wmi_device *wdev, const void *context);
 	void (*remove)(struct wmi_device *wdev);
-- 
cgit v1.2.3


From 4e66934eaadc83b27ada8d42b60894018f3bfabf Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 4 Dec 2021 20:21:55 -0800
Subject: lib: add reference counting tracking infrastructure

It can be hard to track where references are taken and released.

In networking, we have annoying issues at device or netns dismantles,
and we had various proposals to ease root causing them.

This patch adds new infrastructure pairing refcount increases
and decreases. This will self document code, because programmers
will have to associate increments/decrements.

This is controled by CONFIG_REF_TRACKER which can be selected
by users of this feature.

This adds both cpu and memory costs, and thus should probably be
used with care.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/ref_tracker.h | 73 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)
 create mode 100644 include/linux/ref_tracker.h

(limited to 'include/linux')

diff --git a/include/linux/ref_tracker.h b/include/linux/ref_tracker.h
new file mode 100644
index 000000000000..c11c9db5825c
--- /dev/null
+++ b/include/linux/ref_tracker.h
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#ifndef _LINUX_REF_TRACKER_H
+#define _LINUX_REF_TRACKER_H
+#include <linux/refcount.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+
+struct ref_tracker;
+
+struct ref_tracker_dir {
+#ifdef CONFIG_REF_TRACKER
+	spinlock_t		lock;
+	unsigned int		quarantine_avail;
+	refcount_t		untracked;
+	struct list_head	list; /* List of active trackers */
+	struct list_head	quarantine; /* List of dead trackers */
+#endif
+};
+
+#ifdef CONFIG_REF_TRACKER
+static inline void ref_tracker_dir_init(struct ref_tracker_dir *dir,
+					unsigned int quarantine_count)
+{
+	INIT_LIST_HEAD(&dir->list);
+	INIT_LIST_HEAD(&dir->quarantine);
+	spin_lock_init(&dir->lock);
+	dir->quarantine_avail = quarantine_count;
+	refcount_set(&dir->untracked, 1);
+}
+
+void ref_tracker_dir_exit(struct ref_tracker_dir *dir);
+
+void ref_tracker_dir_print(struct ref_tracker_dir *dir,
+			   unsigned int display_limit);
+
+int ref_tracker_alloc(struct ref_tracker_dir *dir,
+		      struct ref_tracker **trackerp, gfp_t gfp);
+
+int ref_tracker_free(struct ref_tracker_dir *dir,
+		     struct ref_tracker **trackerp);
+
+#else /* CONFIG_REF_TRACKER */
+
+static inline void ref_tracker_dir_init(struct ref_tracker_dir *dir,
+					unsigned int quarantine_count)
+{
+}
+
+static inline void ref_tracker_dir_exit(struct ref_tracker_dir *dir)
+{
+}
+
+static inline void ref_tracker_dir_print(struct ref_tracker_dir *dir,
+					 unsigned int display_limit)
+{
+}
+
+static inline int ref_tracker_alloc(struct ref_tracker_dir *dir,
+				    struct ref_tracker **trackerp,
+				    gfp_t gfp)
+{
+	return 0;
+}
+
+static inline int ref_tracker_free(struct ref_tracker_dir *dir,
+				   struct ref_tracker **trackerp)
+{
+	return 0;
+}
+
+#endif
+
+#endif /* _LINUX_REF_TRACKER_H */
-- 
cgit v1.2.3


From 4d92b95ff2f95f13df9bad0b5a25a9f60e72758d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 4 Dec 2021 20:21:57 -0800
Subject: net: add net device refcount tracker infrastructure

net device are refcounted. Over the years we had numerous bugs
caused by imbalanced dev_hold() and dev_put() calls.

The general idea is to be able to precisely pair each decrement with
a corresponding prior increment. Both share a cookie, basically
a pointer to private data storing stack traces.

This patch adds dev_hold_track() and dev_put_track().

To use these helpers, each data structure owning a refcount
should also use a "netdevice_tracker" to pair the hold and put.

netdevice_tracker dev_tracker;
...
dev_hold_track(dev, &dev_tracker, GFP_ATOMIC);
...
dev_put_track(dev, &dev_tracker);

Whenever a leak happens, we will get precise stack traces
of the point dev_hold_track() happened, at device dismantle phase.

We will also get a stack trace if too many dev_put_track() for the same
netdevice_tracker are attempted.

This is guarded by CONFIG_NET_DEV_REFCNT_TRACKER option.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 65117f01d5f2..143d60ed0047 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -48,6 +48,7 @@
 #include <uapi/linux/pkt_cls.h>
 #include <linux/hashtable.h>
 #include <linux/rbtree.h>
+#include <linux/ref_tracker.h>
 
 struct netpoll_info;
 struct device;
@@ -300,6 +301,12 @@ enum netdev_state_t {
 };
 
 
+#ifdef CONFIG_NET_DEV_REFCNT_TRACKER
+typedef struct ref_tracker *netdevice_tracker;
+#else
+typedef struct {} netdevice_tracker;
+#endif
+
 struct gro_list {
 	struct list_head	list;
 	int			count;
@@ -1865,6 +1872,7 @@ enum netdev_ml_priv_type {
  *	@proto_down_reason:	reason a netdev interface is held down
  *	@pcpu_refcnt:		Number of references to this device
  *	@dev_refcnt:		Number of references to this device
+ *	@refcnt_tracker:	Tracker directory for tracked references to this device
  *	@todo_list:		Delayed register/unregister
  *	@link_watch_list:	XXX: need comments on this one
  *
@@ -2178,6 +2186,7 @@ struct net_device {
 #else
 	refcount_t		dev_refcnt;
 #endif
+	struct ref_tracker_dir	refcnt_tracker;
 
 	struct list_head	link_watch_list;
 
@@ -3805,6 +3814,7 @@ void netdev_run_todo(void);
  *	@dev: network device
  *
  * Release reference to device to allow it to be freed.
+ * Try using dev_put_track() instead.
  */
 static inline void dev_put(struct net_device *dev)
 {
@@ -3822,6 +3832,7 @@ static inline void dev_put(struct net_device *dev)
  *	@dev: network device
  *
  * Hold reference to device to keep it from being freed.
+ * Try using dev_hold_track() instead.
  */
 static inline void dev_hold(struct net_device *dev)
 {
@@ -3834,6 +3845,40 @@ static inline void dev_hold(struct net_device *dev)
 	}
 }
 
+static inline void netdev_tracker_alloc(struct net_device *dev,
+					netdevice_tracker *tracker, gfp_t gfp)
+{
+#ifdef CONFIG_NET_DEV_REFCNT_TRACKER
+	ref_tracker_alloc(&dev->refcnt_tracker, tracker, gfp);
+#endif
+}
+
+static inline void netdev_tracker_free(struct net_device *dev,
+				       netdevice_tracker *tracker)
+{
+#ifdef CONFIG_NET_DEV_REFCNT_TRACKER
+	ref_tracker_free(&dev->refcnt_tracker, tracker);
+#endif
+}
+
+static inline void dev_hold_track(struct net_device *dev,
+				  netdevice_tracker *tracker, gfp_t gfp)
+{
+	if (dev) {
+		dev_hold(dev);
+		netdev_tracker_alloc(dev, tracker, gfp);
+	}
+}
+
+static inline void dev_put_track(struct net_device *dev,
+				 netdevice_tracker *tracker)
+{
+	if (dev) {
+		netdev_tracker_free(dev, tracker);
+		dev_put(dev);
+	}
+}
+
 /* Carrier loss detection, dial on demand. The functions netif_carrier_on
  * and _off may be called from IRQ context, but it is caller
  * who is responsible for serialization of these calls.
-- 
cgit v1.2.3


From 80e8921b2b72c300ca56a01729004d30bedb82cd Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 4 Dec 2021 20:21:58 -0800
Subject: net: add net device refcount tracker to struct netdev_rx_queue

This helps debugging net device refcount leaks.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 143d60ed0047..3d691fadd569 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -741,6 +741,8 @@ struct netdev_rx_queue {
 #endif
 	struct kobject			kobj;
 	struct net_device		*dev;
+	netdevice_tracker		dev_tracker;
+
 #ifdef CONFIG_XDP_SOCKETS
 	struct xsk_buff_pool            *pool;
 #endif
-- 
cgit v1.2.3


From 0b688f24b7d611db3a02f3d4ab562d049c78a17d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 4 Dec 2021 20:21:59 -0800
Subject: net: add net device refcount tracker to struct netdev_queue

This will help debugging pesky netdev reference leaks.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3d691fadd569..b4f704337f65 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -586,6 +586,8 @@ struct netdev_queue {
  * read-mostly part
  */
 	struct net_device	*dev;
+	netdevice_tracker	dev_tracker;
+
 	struct Qdisc __rcu	*qdisc;
 	struct Qdisc		*qdisc_sleeping;
 #ifdef CONFIG_SYSFS
-- 
cgit v1.2.3


From 9038c320001dd07f60736018edf608ac5baca0ab Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 4 Dec 2021 20:22:03 -0800
Subject: net: dst: add net device refcount tracking to dst_entry

We want to track all dev_hold()/dev_put() to ease leak hunting.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b4f704337f65..afed3b10491b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3883,6 +3883,23 @@ static inline void dev_put_track(struct net_device *dev,
 	}
 }
 
+static inline void dev_replace_track(struct net_device *odev,
+				     struct net_device *ndev,
+				     netdevice_tracker *tracker,
+				     gfp_t gfp)
+{
+#ifdef CONFIG_NET_DEV_REFCNT_TRACKER
+	if (odev)
+		ref_tracker_free(&odev->refcnt_tracker, tracker);
+#endif
+	dev_hold(ndev);
+	dev_put(odev);
+#ifdef CONFIG_NET_DEV_REFCNT_TRACKER
+	if (ndev)
+		ref_tracker_alloc(&ndev->refcnt_tracker, tracker, gfp);
+#endif
+}
+
 /* Carrier loss detection, dial on demand. The functions netif_carrier_on
  * and _off may be called from IRQ context, but it is caller
  * who is responsible for serialization of these calls.
-- 
cgit v1.2.3


From c04438f58d140723e58050fcb9d33d84cb39e9e9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 4 Dec 2021 20:22:12 -0800
Subject: ipv4: add net device refcount tracker to struct in_device

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/inetdevice.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index 518b484a7f07..674aeead6260 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -24,6 +24,8 @@ struct ipv4_devconf {
 
 struct in_device {
 	struct net_device	*dev;
+	netdevice_tracker	dev_tracker;
+
 	refcount_t		refcnt;
 	int			dead;
 	struct in_ifaddr	__rcu *ifa_list;/* IP ifaddr chain		*/
-- 
cgit v1.2.3


From 63f13937cbe9b00982dfc8e578b1aec8e5037333 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 4 Dec 2021 20:22:14 -0800
Subject: net: linkwatch: add net device refcount tracker

Add a netdevice_tracker inside struct net_device, to track
the self reference when a device is in lweventlist.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index afed3b10491b..69dca1edd5a6 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1950,6 +1950,7 @@ enum netdev_ml_priv_type {
  *			keep a list of interfaces to be deleted.
  *
  *	@dev_addr_shadow:	Copy of @dev_addr to catch direct writes.
+ *	@linkwatch_dev_tracker:	refcount tracker used by linkwatch.
  *
  *	FIXME: cleanup struct net_device such that network protocol info
  *	moves out.
@@ -2280,6 +2281,7 @@ struct net_device {
 	struct bpf_xdp_entity	xdp_state[__MAX_XDP_MODE];
 
 	u8 dev_addr_shadow[MAX_ADDR_LEN];
+	netdevice_tracker	linkwatch_dev_tracker;
 };
 #define to_net_dev(d) container_of(d, struct net_device, dev)
 
-- 
cgit v1.2.3


From 42120a86438379eb77424831ae3d696c2d5cb622 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 4 Dec 2021 20:22:16 -0800
Subject: ipmr, ip6mr: add net device refcount tracker to struct vif_device

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/mroute_base.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index 8071148f29a6..e05ee9f001ff 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -12,6 +12,7 @@
 /**
  * struct vif_device - interface representor for multicast routing
  * @dev: network device being used
+ * @dev_tracker: refcount tracker for @dev reference
  * @bytes_in: statistic; bytes ingressing
  * @bytes_out: statistic; bytes egresing
  * @pkt_in: statistic; packets ingressing
@@ -26,6 +27,7 @@
  */
 struct vif_device {
 	struct net_device *dev;
+	netdevice_tracker dev_tracker;
 	unsigned long bytes_in, bytes_out;
 	unsigned long pkt_in, pkt_out;
 	unsigned long rate_limit;
-- 
cgit v1.2.3


From 5fa5ae605821e0e10ee489d9a6e331fd287ccc57 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 4 Dec 2021 20:22:17 -0800
Subject: netpoll: add net device refcount tracker to struct netpoll

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netpoll.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index e6a2d72e0dc7..bd19c4b91e31 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -24,6 +24,7 @@ union inet_addr {
 
 struct netpoll {
 	struct net_device *dev;
+	netdevice_tracker dev_tracker;
 	char dev_name[IFNAMSIZ];
 	const char *name;
 
-- 
cgit v1.2.3


From 45cac6754529ae17345d8f5b632d9e602a091a20 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 3 Dec 2021 20:53:56 -0800
Subject: net: fix recent csum changes

Vladimir reported csum issues after my recent change in skb_postpull_rcsum()

Issue here is the following:

initial skb->csum is the csum of

[part to be pulled][rest of packet]

Old code:
 skb->csum = csum_sub(skb->csum, csum_partial(pull, pull_length, 0));

New code:
 skb->csum = ~csum_partial(pull, pull_length, ~skb->csum);

This is broken if the csum of [pulled part]
happens to be equal to skb->csum, because end
result of skb->csum is 0 in new code, instead
of being 0xffffffff

David Laight suggested to use

skb->csum = -csum_partial(pull, pull_length, -skb->csum);

I based my patches on existing code present in include/net/seg6.h,
update_csum_diff4() and update_csum_diff16() which might need
a similar fix.

I guess that my tests, mostly pulling 40 bytes of IPv6 header
were not providing enough entropy to hit this bug.

v2: added wsum_negate() to make sparse happy.

Fixes: 29c3002644bd ("net: optimize skb_postpull_rcsum()")
Fixes: 0bd28476f636 ("gro: optimize skb_gro_postpull_rcsum()")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Suggested-by: David Laight <David.Laight@ACULAB.COM>
Cc: David Lebrun <dlebrun@google.com>
Tested-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://lore.kernel.org/r/20211204045356.3659278-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index eae4bd3237a4..dd262bd8ddbe 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3486,7 +3486,8 @@ static inline void skb_postpull_rcsum(struct sk_buff *skb,
 				      const void *start, unsigned int len)
 {
 	if (skb->ip_summed == CHECKSUM_COMPLETE)
-		skb->csum = ~csum_partial(start, len, ~skb->csum);
+		skb->csum = wsum_negate(csum_partial(start, len,
+						     wsum_negate(skb->csum)));
 	else if (skb->ip_summed == CHECKSUM_PARTIAL &&
 		 skb_checksum_start_offset(skb) < 0)
 		skb->ip_summed = CHECKSUM_NONE;
-- 
cgit v1.2.3


From f0e6e6fa41b3d2aa1dcb61dd4ed6d7be004bb5a8 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Mon, 18 Oct 2021 17:14:07 +0200
Subject: KVM: Drop stale kvm_is_transparent_hugepage() declaration

kvm_is_transparent_hugepage() was removed in commit 205d76ff0684 ("KVM:
Remove kvm_is_transparent_hugepage() and PageTransCompoundMap()") but its
declaration in include/linux/kvm_host.h persisted. Drop it.

Fixes: 205d76ff0684 (""KVM: Remove kvm_is_transparent_hugepage() and PageTransCompoundMap()")
Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20211018151407.2107363-1-vkuznets@redhat.com
---
 include/linux/kvm_host.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c310648cc8f1..6d138adc78af 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1174,7 +1174,6 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
 
 bool kvm_is_reserved_pfn(kvm_pfn_t pfn);
 bool kvm_is_zone_device_pfn(kvm_pfn_t pfn);
-bool kvm_is_transparent_hugepage(kvm_pfn_t pfn);
 
 struct kvm_irq_ack_notifier {
 	struct hlist_node link;
-- 
cgit v1.2.3


From 77993b595ada5731e513eb06a0f4bf4b9f1e9532 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 29 Nov 2021 18:46:54 +0100
Subject: locking: Allow to include asm/spinlock_types.h from
 linux/spinlock_types_raw.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The printk header file includes ratelimit_types.h for its __ratelimit()
based usage. It is required for the static initializer used in
printk_ratelimited(). It uses a raw_spinlock_t and includes the
spinlock_types.h.

PREEMPT_RT substitutes spinlock_t with a rtmutex based implementation and so
its spinlock_t implmentation (provided by spinlock_rt.h) includes rtmutex.h and
atomic.h which leads to recursive includes where defines are missing.

By including only the raw_spinlock_t defines it avoids the atomic.h
related includes at this stage.

An example on powerpc:

|  CALL    scripts/atomic/check-atomics.sh
|In file included from include/linux/bug.h:5,
|                 from include/linux/page-flags.h:10,
|                 from kernel/bounds.c:10:
|arch/powerpc/include/asm/page_32.h: In function âclear_pageâ:
|arch/powerpc/include/asm/bug.h:87:4: error: implicit declaration of function â=80=98__WARNâ=80=99 [-Werror=3Dimplicit-function-declaration]
|   87 |    __WARN();    \
|      |    ^~~~~~
|arch/powerpc/include/asm/page_32.h:48:2: note: in expansion of macro âWARN_ONâ=99
|   48 |  WARN_ON((unsigned long)addr & (L1_CACHE_BYTES - 1));
|      |  ^~~~~~~
|arch/powerpc/include/asm/bug.h:58:17: error: invalid application of âsizeofâ=99 to incomplete type âstruct bug_entryâ=99
|   58 |     "i" (sizeof(struct bug_entry)), \
|      |                 ^~~~~~
|arch/powerpc/include/asm/bug.h:89:3: note: in expansion of macro âBUG_ENTRYâ=99
|   89 |   BUG_ENTRY(PPC_TLNEI " %4, 0",   \
|      |   ^~~~~~~~~
|arch/powerpc/include/asm/page_32.h:48:2: note: in expansion of macro âWARN_ONâ=99
|   48 |  WARN_ON((unsigned long)addr & (L1_CACHE_BYTES - 1));
|      |  ^~~~~~~
|In file included from arch/powerpc/include/asm/ptrace.h:298,
|                 from arch/powerpc/include/asm/hw_irq.h:12,
|                 from arch/powerpc/include/asm/irqflags.h:12,
|                 from include/linux/irqflags.h:16,
|                 from include/asm-generic/cmpxchg-local.h:6,
|                 from arch/powerpc/include/asm/cmpxchg.h:526,
|                 from arch/powerpc/include/asm/atomic.h:11,
|                 from include/linux/atomic.h:7,
|                 from include/linux/rwbase_rt.h:6,
|                 from include/linux/rwlock_types.h:55,
|                 from include/linux/spinlock_types.h:74,
|                 from include/linux/ratelimit_types.h:7,
|                 from include/linux/printk.h:10,
|                 from include/asm-generic/bug.h:22,
|                 from arch/powerpc/include/asm/bug.h:109,
|                 from include/linux/bug.h:5,
|                 from include/linux/page-flags.h:10,
|                 from kernel/bounds.c:10:
|include/linux/thread_info.h: In function â=80=98copy_overflowâ=80=99:
|include/linux/thread_info.h:210:2: error: implicit declaration of function â=80=98WARNâ=80=99 [-Werror=3Dimplicit-function-declaration]
|  210 |  WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count);
|      |  ^~~~

The WARN / BUG include pulls in printk.h and then ptrace.h expects WARN
(from bug.h) which is not yet complete. Even hw_irq.h has WARN_ON()
statements.

On POWERPC64 there are missing atomic64 defines while building 32bit
VDSO:
|  VDSO32C arch/powerpc/kernel/vdso32/vgettimeofday.o
|In file included from include/linux/atomic.h:80,
|                 from include/linux/rwbase_rt.h:6,
|                 from include/linux/rwlock_types.h:55,
|                 from include/linux/spinlock_types.h:74,
|                 from include/linux/ratelimit_types.h:7,
|                 from include/linux/printk.h:10,
|                 from include/linux/kernel.h:19,
|                 from arch/powerpc/include/asm/page.h:11,
|                 from arch/powerpc/include/asm/vdso/gettimeofday.h:5,
|                 from include/vdso/datapage.h:137,
|                 from lib/vdso/gettimeofday.c:5,
|                 from <command-line>:
|include/linux/atomic-arch-fallback.h: In function âarch_atomic64_incâ=99:
|include/linux/atomic-arch-fallback.h:1447:2: error: implicit declaration of function âarch_atomic64_addâ; did you mean âarch_atomic_addâ? [-Werror=3Dimpl
|icit-function-declaration]
| 1447 |  arch_atomic64_add(1, v);
|      |  ^~~~~~~~~~~~~~~~~
|      |  arch_atomic_add

The generic fallback is not included, atomics itself are not used. If
kernel.h does not include printk.h then it comes later from the bug.h
include.

Allow asm/spinlock_types.h to be included from
linux/spinlock_types_raw.h.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20211129174654.668506-12-bigeasy@linutronix.de
---
 include/linux/ratelimit_types.h   | 2 +-
 include/linux/spinlock_types_up.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ratelimit_types.h b/include/linux/ratelimit_types.h
index b676aa419eef..c21c7f8103e2 100644
--- a/include/linux/ratelimit_types.h
+++ b/include/linux/ratelimit_types.h
@@ -4,7 +4,7 @@
 
 #include <linux/bits.h>
 #include <linux/param.h>
-#include <linux/spinlock_types.h>
+#include <linux/spinlock_types_raw.h>
 
 #define DEFAULT_RATELIMIT_INTERVAL	(5 * HZ)
 #define DEFAULT_RATELIMIT_BURST		10
diff --git a/include/linux/spinlock_types_up.h b/include/linux/spinlock_types_up.h
index c09b6407ae1b..7f86a2016ac5 100644
--- a/include/linux/spinlock_types_up.h
+++ b/include/linux/spinlock_types_up.h
@@ -1,7 +1,7 @@
 #ifndef __LINUX_SPINLOCK_TYPES_UP_H
 #define __LINUX_SPINLOCK_TYPES_UP_H
 
-#ifndef __LINUX_SPINLOCK_TYPES_H
+#ifndef __LINUX_SPINLOCK_TYPES_RAW_H
 # error "please don't include this file directly"
 #endif
 
-- 
cgit v1.2.3


From 8d9f738f16a3ee9f2341578873c542ddd9802fe4 Mon Sep 17 00:00:00 2001
From: Yanteng Si <siyanteng01@gmail.com>
Date: Tue, 7 Dec 2021 20:32:30 +0800
Subject: regulator: fix bullet lists of regulator_ops comment

Since 89a6a5e56c82("regulator: add property parsing and callbacks to set protection limits")
which introduced a warning:

Documentation/driver-api/regulator:166: ./include/linux/regulator/driver.h:96: WARNING: Unexpected indentation.
Documentation/driver-api/regulator:166: ./include/linux/regulator/driver.h:98: WARNING: Block quote ends without a blank line; unexpected unindent.

Let's fix them.

Signed-off-by: Yanteng Si <siyanteng@loongson.cn>
Link: https://lore.kernel.org/r/20211207123230.2262047-1-siyanteng@loongson.cn
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regulator/driver.h | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index 4078c7776453..720684995a77 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -90,15 +90,19 @@ enum regulator_detection_severity {
  * @set_over_current_protection: Support enabling of and setting limits for over
  *	current situation detection. Detection can be configured for three
  *	levels of severity.
- *	REGULATOR_SEVERITY_PROT should automatically shut down the regulator(s).
- *	REGULATOR_SEVERITY_ERR should indicate that over-current situation is
- *		caused by an unrecoverable error but HW does not perform
- *		automatic shut down.
- *	REGULATOR_SEVERITY_WARN should indicate situation where hardware is
- *		still believed to not be damaged but that a board sepcific
- *		recovery action is needed. If lim_uA is 0 the limit should not
- *		be changed but the detection should just be enabled/disabled as
- *		is requested.
+ *
+ *	- REGULATOR_SEVERITY_PROT should automatically shut down the regulator(s).
+ *
+ *	- REGULATOR_SEVERITY_ERR should indicate that over-current situation is
+ *		  caused by an unrecoverable error but HW does not perform
+ *		  automatic shut down.
+ *
+ *	- REGULATOR_SEVERITY_WARN should indicate situation where hardware is
+ *		  still believed to not be damaged but that a board sepcific
+ *		  recovery action is needed. If lim_uA is 0 the limit should not
+ *		  be changed but the detection should just be enabled/disabled as
+ *		  is requested.
+ *
  * @set_over_voltage_protection: Support enabling of and setting limits for over
  *	voltage situation detection. Detection can be configured for same
  *	severities as over current protection. Units of uV.
-- 
cgit v1.2.3


From 13244cccc2b61ec715f0ac583d3037497004d4a5 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Wed, 1 Dec 2021 10:54:52 -0800
Subject: skbuff: introduce skb_pull_data

Like skb_pull but returns the original data pointer before pulling the
data after performing a check against sbk->len.

This allows to change code that does "struct foo *p = (void *)skb->data;"
which is hard to audit and error prone, to:

        p = skb_pull_data(skb, sizeof(*p));
        if (!p)
                return;

Which is both safer and cleaner.

Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/linux/skbuff.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index eba256af64a5..877dda38684a 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2373,6 +2373,8 @@ static inline void *skb_pull_inline(struct sk_buff *skb, unsigned int len)
 	return unlikely(len > skb->len) ? NULL : __skb_pull(skb, len);
 }
 
+void *skb_pull_data(struct sk_buff *skb, size_t len);
+
 void *__pskb_pull_tail(struct sk_buff *skb, int delta);
 
 static inline void *__pskb_pull(struct sk_buff *skb, unsigned int len)
-- 
cgit v1.2.3


From 6fadaa565882cd7afc501de5921db6f5e45c784b Mon Sep 17 00:00:00 2001
From: Maxim Galaganov <max@internet.ru>
Date: Fri, 3 Dec 2021 14:35:39 -0800
Subject: tcp: expose __tcp_sock_set_cork and __tcp_sock_set_nodelay

Expose __tcp_sock_set_cork() and __tcp_sock_set_nodelay() for use in
MPTCP setsockopt code -- namely for syncing MPTCP socket options with
subflows inside sync_socket_options() while already holding the subflow
socket lock.

Acked-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Signed-off-by: Maxim Galaganov <max@internet.ru>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/tcp.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 48d8a363319e..78b91bb92f0d 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -512,11 +512,13 @@ static inline u16 tcp_mss_clamp(const struct tcp_sock *tp, u16 mss)
 int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount,
 		  int shiftlen);
 
+void __tcp_sock_set_cork(struct sock *sk, bool on);
 void tcp_sock_set_cork(struct sock *sk, bool on);
 int tcp_sock_set_keepcnt(struct sock *sk, int val);
 int tcp_sock_set_keepidle_locked(struct sock *sk, int val);
 int tcp_sock_set_keepidle(struct sock *sk, int val);
 int tcp_sock_set_keepintvl(struct sock *sk, int val);
+void __tcp_sock_set_nodelay(struct sock *sk, bool on);
 void tcp_sock_set_nodelay(struct sock *sk);
 void tcp_sock_set_quickack(struct sock *sk, int val);
 int tcp_sock_set_syncnt(struct sock *sk, int val);
-- 
cgit v1.2.3


From 213d56bf33bdda835bac04046f09256a75c5ca8e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Tue, 19 Oct 2021 02:08:07 +0200
Subject: rcu/nocb: Prepare state machine for a new step

Currently SEGCBLIST_SOFTIRQ_ONLY is a bit of an exception among the
segcblist flags because it is an exclusive state that doesn't mix up
with the other flags. Remove it in favour of:

_ A flag specifying that rcu_core() needs to perform callbacks execution
  and acceleration

and

_ A flag specifying we want the nocb lock to be held in any needed
  circumstances

This clarifies the code and is more flexible: It allows to have a state
where rcu_core() runs with locking while offloading hasn't started yet.
This is a necessary step to prepare for triggering rcu_core() at the
very beginning of the de-offloading process so that rcu_core() won't
dismiss work while being preempted by the de-offloading process, at
least not without a pending subsequent rcu_core() that will quickly
catch up.

Reviewed-by: Valentin Schneider <Valentin.Schneider@arm.com>
Tested-by: Valentin Schneider <valentin.schneider@arm.com>
Tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Neeraj Upadhyay <neeraju@codeaurora.org>
Cc: Uladzislau Rezki <urezki@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcu_segcblist.h | 37 +++++++++++++++++++++++--------------
 1 file changed, 23 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h
index 3db96c4f45fd..812961b1d064 100644
--- a/include/linux/rcu_segcblist.h
+++ b/include/linux/rcu_segcblist.h
@@ -69,7 +69,7 @@ struct rcu_cblist {
  *
  *
  *  ----------------------------------------------------------------------------
- *  |                         SEGCBLIST_SOFTIRQ_ONLY                           |
+ *  |                              SEGCBLIST_RCU_CORE                          |
  *  |                                                                          |
  *  |  Callbacks processed by rcu_core() from softirqs or local                |
  *  |  rcuc kthread, without holding nocb_lock.                                |
@@ -77,7 +77,7 @@ struct rcu_cblist {
  *                                         |
  *                                         v
  *  ----------------------------------------------------------------------------
- *  |                        SEGCBLIST_OFFLOADED                               |
+ *  |       SEGCBLIST_RCU_CORE | SEGCBLIST_LOCKING | SEGCBLIST_OFFLOADED       |
  *  |                                                                          |
  *  | Callbacks processed by rcu_core() from softirqs or local                 |
  *  | rcuc kthread, while holding nocb_lock. Waking up CB and GP kthreads,     |
@@ -89,7 +89,9 @@ struct rcu_cblist {
  *                        |                                 |
  *                        v                                 v
  *  ---------------------------------------  ----------------------------------|
- *  |        SEGCBLIST_OFFLOADED |        |  |     SEGCBLIST_OFFLOADED |       |
+ *  |        SEGCBLIST_RCU_CORE   |       |  |     SEGCBLIST_RCU_CORE   |      |
+ *  |        SEGCBLIST_LOCKING    |       |  |     SEGCBLIST_LOCKING    |      |
+ *  |        SEGCBLIST_OFFLOADED  |       |  |     SEGCBLIST_OFFLOADED  |      |
  *  |        SEGCBLIST_KTHREAD_CB         |  |     SEGCBLIST_KTHREAD_GP        |
  *  |                                     |  |                                 |
  *  |                                     |  |                                 |
@@ -104,9 +106,10 @@ struct rcu_cblist {
  *                                         |
  *                                         v
  *  |--------------------------------------------------------------------------|
- *  |                           SEGCBLIST_OFFLOADED |                          |
- *  |                           SEGCBLIST_KTHREAD_CB |                         |
- *  |                           SEGCBLIST_KTHREAD_GP                           |
+ *  |                           SEGCBLIST_LOCKING    |                         |
+ *  |                           SEGCBLIST_OFFLOADED  |                         |
+ *  |                           SEGCBLIST_KTHREAD_GP |                         |
+ *  |                           SEGCBLIST_KTHREAD_CB                           |
  *  |                                                                          |
  *  |   Kthreads handle callbacks holding nocb_lock, local rcu_core() stops    |
  *  |   handling callbacks. Enable bypass queueing.                            |
@@ -120,7 +123,8 @@ struct rcu_cblist {
  *
  *
  *  |--------------------------------------------------------------------------|
- *  |                           SEGCBLIST_OFFLOADED |                          |
+ *  |                           SEGCBLIST_LOCKING    |                         |
+ *  |                           SEGCBLIST_OFFLOADED  |                         |
  *  |                           SEGCBLIST_KTHREAD_CB |                         |
  *  |                           SEGCBLIST_KTHREAD_GP                           |
  *  |                                                                          |
@@ -130,6 +134,8 @@ struct rcu_cblist {
  *                                      |
  *                                      v
  *  |--------------------------------------------------------------------------|
+ *  |                           SEGCBLIST_RCU_CORE   |                         |
+ *  |                           SEGCBLIST_LOCKING    |                         |
  *  |                           SEGCBLIST_KTHREAD_CB |                         |
  *  |                           SEGCBLIST_KTHREAD_GP                           |
  *  |                                                                          |
@@ -143,7 +149,9 @@ struct rcu_cblist {
  *                     |                                 |
  *                     v                                 v
  *  ---------------------------------------------------------------------------|
- *  |                                                                          |
+ *  |                                     |                                    |
+ *  |        SEGCBLIST_RCU_CORE |         |       SEGCBLIST_RCU_CORE |         |
+ *  |        SEGCBLIST_LOCKING  |         |       SEGCBLIST_LOCKING  |         |
  *  |        SEGCBLIST_KTHREAD_CB         |       SEGCBLIST_KTHREAD_GP         |
  *  |                                     |                                    |
  *  | GP kthread woke up and              |   CB kthread woke up and           |
@@ -159,7 +167,7 @@ struct rcu_cblist {
  *                                      |
  *                                      v
  *  ----------------------------------------------------------------------------
- *  |                                   0                                      |
+ *  |                SEGCBLIST_RCU_CORE | SEGCBLIST_LOCKING                    |
  *  |                                                                          |
  *  | Callbacks processed by rcu_core() from softirqs or local                 |
  *  | rcuc kthread, while holding nocb_lock. Forbid nocb_timer to be armed.    |
@@ -168,17 +176,18 @@ struct rcu_cblist {
  *                                      |
  *                                      v
  *  ----------------------------------------------------------------------------
- *  |                         SEGCBLIST_SOFTIRQ_ONLY                           |
+ *  |                         SEGCBLIST_RCU_CORE                               |
  *  |                                                                          |
  *  |  Callbacks processed by rcu_core() from softirqs or local                |
  *  |  rcuc kthread, without holding nocb_lock.                                |
  *  ----------------------------------------------------------------------------
  */
 #define SEGCBLIST_ENABLED	BIT(0)
-#define SEGCBLIST_SOFTIRQ_ONLY	BIT(1)
-#define SEGCBLIST_KTHREAD_CB	BIT(2)
-#define SEGCBLIST_KTHREAD_GP	BIT(3)
-#define SEGCBLIST_OFFLOADED	BIT(4)
+#define SEGCBLIST_RCU_CORE	BIT(1)
+#define SEGCBLIST_LOCKING	BIT(2)
+#define SEGCBLIST_KTHREAD_CB	BIT(3)
+#define SEGCBLIST_KTHREAD_GP	BIT(4)
+#define SEGCBLIST_OFFLOADED	BIT(5)
 
 struct rcu_segcblist {
 	struct rcu_head *head;
-- 
cgit v1.2.3


From fbb94cbd70d41c7511460896dfc7f9ea5da704b3 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Tue, 19 Oct 2021 02:08:08 +0200
Subject: rcu/nocb: Invoke rcu_core() at the start of deoffloading

On PREEMPT_RT, if rcu_core() is preempted by the de-offloading process,
some work, such as callbacks acceleration and invocation, may be left
unattended due to the volatile checks on the offloaded state.

In the worst case this work is postponed until the next rcu_pending()
check that can take a jiffy to reach, which can be a problem in case
of callbacks flooding.

Solve that with invoking rcu_core() early in the de-offloading process.
This way any work dismissed by an ongoing rcu_core() call fooled by
a preempting deoffloading process will be caught up by a nearby future
recall to rcu_core(), this time fully aware of the de-offloading state.

Tested-by: Valentin Schneider <valentin.schneider@arm.com>
Tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Valentin Schneider <valentin.schneider@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Neeraj Upadhyay <neeraju@codeaurora.org>
Cc: Uladzislau Rezki <urezki@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcu_segcblist.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h
index 812961b1d064..659d13a7ddaa 100644
--- a/include/linux/rcu_segcblist.h
+++ b/include/linux/rcu_segcblist.h
@@ -136,6 +136,20 @@ struct rcu_cblist {
  *  |--------------------------------------------------------------------------|
  *  |                           SEGCBLIST_RCU_CORE   |                         |
  *  |                           SEGCBLIST_LOCKING    |                         |
+ *  |                           SEGCBLIST_OFFLOADED  |                         |
+ *  |                           SEGCBLIST_KTHREAD_CB |                         |
+ *  |                           SEGCBLIST_KTHREAD_GP                           |
+ *  |                                                                          |
+ *  |   CB/GP kthreads handle callbacks holding nocb_lock, local rcu_core()    |
+ *  |   handles callbacks concurrently. Bypass enqueue is enabled.             |
+ *  |   Invoke RCU core so we make sure not to preempt it in the middle with   |
+ *  |   leaving some urgent work unattended within a jiffy.                    |
+ *  ----------------------------------------------------------------------------
+ *                                      |
+ *                                      v
+ *  |--------------------------------------------------------------------------|
+ *  |                           SEGCBLIST_RCU_CORE   |                         |
+ *  |                           SEGCBLIST_LOCKING    |                         |
  *  |                           SEGCBLIST_KTHREAD_CB |                         |
  *  |                           SEGCBLIST_KTHREAD_GP                           |
  *  |                                                                          |
-- 
cgit v1.2.3


From 81faa4f6fba429334ff72bb5ba7696818509b5b5 Mon Sep 17 00:00:00 2001
From: Li Zhijian <zhijianx.li@intel.com>
Date: Wed, 3 Nov 2021 16:30:28 +0800
Subject: locktorture,rcutorture,torture: Always log error message

Unconditionally log messages corresponding to errors.

Acked-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Li Zhijian <zhijianx.li@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/torture.h | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/torture.h b/include/linux/torture.h
index 24f58e50a94b..63fa4196e51c 100644
--- a/include/linux/torture.h
+++ b/include/linux/torture.h
@@ -38,13 +38,8 @@ do {										\
 		pr_alert("%s" TORTURE_FLAG " %s\n", torture_type, s);		\
 	}									\
 } while (0)
-#define VERBOSE_TOROUT_ERRSTRING(s) \
-do {										\
-	if (verbose) {								\
-		verbose_torout_sleep();						\
-		pr_alert("%s" TORTURE_FLAG "!!! %s\n", torture_type, s);	\
-	}									\
-} while (0)
+#define TOROUT_ERRSTRING(s) \
+	pr_alert("%s" TORTURE_FLAG "!!! %s\n", torture_type, s)
 void verbose_torout_sleep(void);
 
 #define torture_init_error(firsterr)						\
-- 
cgit v1.2.3


From 08f0b22d731fa86957749c649d6ef6ebc07e8ad2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 6 Dec 2021 17:30:27 -0800
Subject: net: eql: add net device refcount tracker

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/if_eql.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/if_eql.h b/include/linux/if_eql.h
index d984694c384d..d75601d613cc 100644
--- a/include/linux/if_eql.h
+++ b/include/linux/if_eql.h
@@ -26,6 +26,7 @@
 typedef struct slave {
 	struct list_head	list;
 	struct net_device	*dev;
+	netdevice_tracker	dev_tracker;
 	long			priority;
 	long			priority_bps;
 	long			priority_Bps;
-- 
cgit v1.2.3


From 19c9ebf6ed70856385296a65e78c1699081b152f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 6 Dec 2021 17:30:28 -0800
Subject: vlan: add net device refcount tracker

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/if_vlan.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 41a518336673..8420fe504927 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -162,6 +162,7 @@ struct netpoll;
  *	@vlan_id: VLAN identifier
  *	@flags: device flags
  *	@real_dev: underlying netdevice
+ *	@dev_tracker: refcount tracker for @real_dev reference
  *	@real_dev_addr: address of underlying netdevice
  *	@dent: proc dir entry
  *	@vlan_pcpu_stats: ptr to percpu rx stats
@@ -177,6 +178,8 @@ struct vlan_dev_priv {
 	u16					flags;
 
 	struct net_device			*real_dev;
+	netdevice_tracker			dev_tracker;
+
 	unsigned char				real_dev_addr[ETH_ALEN];
 
 	struct proc_dir_entry			*dent;
-- 
cgit v1.2.3


From f12bf6f3f942b37de65eeea8be25903587fec930 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 6 Dec 2021 17:30:30 -0800
Subject: net: watchdog: add net device refcount tracker

Add a netdevice_tracker inside struct net_device, to track
the self reference when a device has an active watchdog timer.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 69dca1edd5a6..1a748ee9a421 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1951,6 +1951,7 @@ enum netdev_ml_priv_type {
  *
  *	@dev_addr_shadow:	Copy of @dev_addr to catch direct writes.
  *	@linkwatch_dev_tracker:	refcount tracker used by linkwatch.
+ *	@watchdog_dev_tracker:	refcount tracker used by watchdog.
  *
  *	FIXME: cleanup struct net_device such that network protocol info
  *	moves out.
@@ -2282,6 +2283,7 @@ struct net_device {
 
 	u8 dev_addr_shadow[MAX_ADDR_LEN];
 	netdevice_tracker	linkwatch_dev_tracker;
+	netdevice_tracker	watchdog_dev_tracker;
 };
 #define to_net_dev(d) container_of(d, struct net_device, dev)
 
-- 
cgit v1.2.3


From a97770cc4016c2733bcef9dbe3d5b1ad02d13356 Mon Sep 17 00:00:00 2001
From: Yanteng Si <siyanteng01@gmail.com>
Date: Mon, 6 Dec 2021 16:12:27 +0800
Subject: net: phy: Remove unnecessary indentation in the comments of
 phy_device

Fix warning as:

linux-next/Documentation/networking/kapi:122: ./include/linux/phy.h:543: WARNING: Unexpected indentation.
linux-next/Documentation/networking/kapi:122: ./include/linux/phy.h:544: WARNING: Block quote ends without a blank line; unexpected unindent.
linux-next/Documentation/networking/kapi:122: ./include/linux/phy.h:546: WARNING: Unexpected indentation.

Suggested-by: Akira Yokosawa <akiyks@gmail.com>
Signed-off-by: Yanteng Si <siyanteng@loongson.cn>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/phy.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 96e43fbb2dd8..cbf03a5f9cf5 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -538,11 +538,12 @@ struct macsec_ops;
  * @mac_managed_pm: Set true if MAC driver takes of suspending/resuming PHY
  * @state: State of the PHY for management purposes
  * @dev_flags: Device-specific flags used by the PHY driver.
- *		Bits [15:0] are free to use by the PHY driver to communicate
- *			    driver specific behavior.
- *		Bits [23:16] are currently reserved for future use.
- *		Bits [31:24] are reserved for defining generic
- *			     PHY driver behavior.
+ *
+ *      - Bits [15:0] are free to use by the PHY driver to communicate
+ *        driver specific behavior.
+ *      - Bits [23:16] are currently reserved for future use.
+ *      - Bits [31:24] are reserved for defining generic
+ *        PHY driver behavior.
  * @irq: IRQ number of the PHY's interrupt (-1 if none)
  * @phy_timer: The timer for handling the state machine
  * @phylink: Pointer to phylink instance for this PHY
-- 
cgit v1.2.3


From 330c6d3bfa268794bf692165d0f781f1c2d4d83e Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Date: Wed, 24 Nov 2021 10:45:36 +0900
Subject: can: bittiming: replace CAN units with the generic ones from
 linux/units.h

In [1], we introduced a set of units in linux/can/bittiming.h. Since
then, generic SI prefixes were added to linux/units.h in [2]. Those
new prefixes can perfectly replace CAN specific ones.

This patch replaces all occurrences of the CAN units with their
corresponding prefix (from linux/units) and the unit (as a comment)
according to below table.

 CAN units	SI metric prefix (from linux/units) + unit (as a comment)
 ------------------------------------------------------------------------
 CAN_KBPS	KILO /* BPS */
 CAN_MBPS	MEGA /* BPS */
 CAM_MHZ	MEGA /* Hz */

The definition are then removed from linux/can/bittiming.h

[1] commit 1d7750760b70 ("can: bittiming: add CAN_KBPS, CAN_MBPS and
CAN_MHZ macros")

[2] commit 26471d4a6cf8 ("units: Add SI metric prefix definitions")

Link: https://lore.kernel.org/all/20211124014536.782550-1-mailhol.vincent@wanadoo.fr
Suggested-by: Jimmy Assarsson <extja@kvaser.com>
Suggested-by: Oliver Hartkopp <socketcan@hartkopp.net>
Signed-off-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/linux/can/bittiming.h | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h
index 20b50baf3a02..a81652d1c6f3 100644
--- a/include/linux/can/bittiming.h
+++ b/include/linux/can/bittiming.h
@@ -12,13 +12,6 @@
 #define CAN_SYNC_SEG 1
 
 
-/* Kilobits and Megabits per second */
-#define CAN_KBPS 1000UL
-#define CAN_MBPS 1000000UL
-
-/* Megahertz */
-#define CAN_MHZ 1000000UL
-
 #define CAN_CTRLMODE_TDC_MASK					\
 	(CAN_CTRLMODE_TDC_AUTO | CAN_CTRLMODE_TDC_MANUAL)
 
-- 
cgit v1.2.3


From 27592ae8dbe41033261b6fdf27d78998aabd2665 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Tue, 16 Nov 2021 16:03:57 +0000
Subject: KVM: Move wiping of the kvm->vcpus array to common code

All architectures have similar loops iterating over the vcpus,
freeing one vcpu at a time, and eventually wiping the reference
off the vcpus array. They are also inconsistently taking
the kvm->lock mutex when wiping the references from the array.

Make this code common, which will simplify further changes.
The locking is dropped altogether, as this should only be called
when there is no further references on the kvm structure.

Reviewed-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Message-Id: <20211116160403.4074052-2-maz@kernel.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c310648cc8f1..e2f9f8f67c58 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -733,7 +733,7 @@ static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id)
 		if (WARN_ON_ONCE(!memslot->npages)) {			\
 		} else
 
-void kvm_vcpu_destroy(struct kvm_vcpu *vcpu);
+void kvm_destroy_vcpus(struct kvm *kvm);
 
 void vcpu_load(struct kvm_vcpu *vcpu);
 void vcpu_put(struct kvm_vcpu *vcpu);
-- 
cgit v1.2.3


From c5b077549136584618a66258f09d8d4b41e7409c Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Tue, 16 Nov 2021 16:04:01 +0000
Subject: KVM: Convert the kvm->vcpus array to a xarray

At least on arm64 and x86, the vcpus array is pretty huge (up to
1024 entries on x86) and is mostly empty in the majority of the cases
(running 1k vcpu VMs is not that common).

This mean that we end-up with a 4kB block of unused memory in the
middle of the kvm structure.

Instead of wasting away this memory, let's use an xarray instead,
which gives us almost the same flexibility as a normal array, but
with a reduced memory usage with smaller VMs.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Message-Id: <20211116160403.4074052-6-maz@kernel.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index e2f9f8f67c58..2201dc07126a 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -29,6 +29,7 @@
 #include <linux/refcount.h>
 #include <linux/nospec.h>
 #include <linux/notifier.h>
+#include <linux/xarray.h>
 #include <asm/signal.h>
 
 #include <linux/kvm.h>
@@ -552,7 +553,7 @@ struct kvm {
 	struct mutex slots_arch_lock;
 	struct mm_struct *mm; /* userspace tied to this vm */
 	struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM];
-	struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
+	struct xarray vcpu_array;
 
 	/* Used to wait for completion of MMU notifiers.  */
 	spinlock_t mn_invalidate_lock;
@@ -701,7 +702,7 @@ static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i)
 
 	/* Pairs with smp_wmb() in kvm_vm_ioctl_create_vcpu.  */
 	smp_rmb();
-	return kvm->vcpus[i];
+	return xa_load(&kvm->vcpu_array, i);
 }
 
 #define kvm_for_each_vcpu(idx, vcpup, kvm) \
-- 
cgit v1.2.3


From 46808a4cb89708c2e5b264eb9d1035762581921b Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Tue, 16 Nov 2021 16:04:02 +0000
Subject: KVM: Use 'unsigned long' as kvm_for_each_vcpu()'s index

Everywhere we use kvm_for_each_vpcu(), we use an int as the vcpu
index. Unfortunately, we're about to move rework the iterator,
which requires this to be upgrade to an unsigned long.

Let's bite the bullet and repaint all of it in one go.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Message-Id: <20211116160403.4074052-7-maz@kernel.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 2201dc07126a..7da6086262c6 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -714,7 +714,7 @@ static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i)
 static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id)
 {
 	struct kvm_vcpu *vcpu = NULL;
-	int i;
+	unsigned long i;
 
 	if (id < 0)
 		return NULL;
-- 
cgit v1.2.3


From 214bd3a6f46981b7867946e1b4f628a06bcf2091 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Tue, 16 Nov 2021 16:04:03 +0000
Subject: KVM: Convert kvm_for_each_vcpu() to using xa_for_each_range()

Now that the vcpu array is backed by an xarray, use the optimised
iterator that matches the underlying data structure.

Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Message-Id: <20211116160403.4074052-8-maz@kernel.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7da6086262c6..66548287ed42 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -705,11 +705,9 @@ static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i)
 	return xa_load(&kvm->vcpu_array, i);
 }
 
-#define kvm_for_each_vcpu(idx, vcpup, kvm) \
-	for (idx = 0; \
-	     idx < atomic_read(&kvm->online_vcpus) && \
-	     (vcpup = kvm_get_vcpu(kvm, idx)) != NULL; \
-	     idx++)
+#define kvm_for_each_vcpu(idx, vcpup, kvm)		   \
+	xa_for_each_range(&kvm->vcpu_array, idx, vcpup, 0, \
+			  (atomic_read(&kvm->online_vcpus) - 1))
 
 static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id)
 {
-- 
cgit v1.2.3


From afa319a54a8c760ba59683cd3c4318635049a664 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Mon, 6 Dec 2021 20:54:07 +0100
Subject: KVM: Require total number of memslot pages to fit in an unsigned long

Explicitly disallow creating more memslot pages than can fit in an
unsigned long, KVM doesn't correctly handle a total number of memslot
pages that doesn't fit in an unsigned long and remedying that would be a
waste of time.

For a 64-bit kernel, this is a nop as memslots are not allowed to overlap
in the gfn address space.

With a 32-bit kernel, userspace can at most address 3gb of virtual memory,
whereas wrapping the total number of pages would require 4tb+ of guest
physical memory.  Even with x86's second address space for SMM, userspace
would need to alias all of guest memory more than one _thousand_ times.
And on older x86 hardware with MAXPHYADDR < 43, the guest couldn't
actually access any of those aliases even if userspace lied about
guest.MAXPHYADDR.

On 390 and arm64, this is a nop as they don't support 32-bit hosts.

On x86, practically speaking this is simply acknowledging reality as the
existing kvm_mmu_calculate_default_mmu_pages() assumes the total number
of pages fits in an "unsigned long".

On PPC, this is likely a nop as every flavor of PPC KVM assumes gfns (and
gpas!) fit in unsigned long.  arch/powerpc/kvm/book3s_32_mmu_host.c goes
a step further and fails the build if CONFIG_PTE_64BIT=y, which
presumably means that it does't support 64-bit physical addresses.

On MIPS, this is also likely a nop as the core MMU helpers assume gpas
fit in unsigned long, e.g. see kvm_mips_##name##_pte.

And finally, RISC-V is a "don't care" as it doesn't exist in any release,
i.e. there is no established ABI to break.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
Message-Id: <1c2c91baf8e78acccd4dad38da591002e61c013c.1638817638.git.maciej.szmigiero@oracle.com>
---
 include/linux/kvm_host.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 66548287ed42..e38705359af5 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -552,6 +552,7 @@ struct kvm {
 	 */
 	struct mutex slots_arch_lock;
 	struct mm_struct *mm; /* userspace tied to this vm */
+	unsigned long nr_memslot_pages;
 	struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM];
 	struct xarray vcpu_array;
 
-- 
cgit v1.2.3


From 537a17b3149300987456e8949ccb991e604047d6 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Mon, 6 Dec 2021 20:54:11 +0100
Subject: KVM: Let/force architectures to deal with arch specific memslot data

Pass the "old" slot to kvm_arch_prepare_memory_region() and force arch
code to handle propagating arch specific data from "new" to "old" when
necessary.  This is a baby step towards dynamically allocating "new" from
the get go, and is a (very) minor performance boost on x86 due to not
unnecessarily copying arch data.

For PPC HV, copy the rmap in the !CREATE and !DELETE paths, i.e. for MOVE
and FLAGS_ONLY.  This is functionally a nop as the previous behavior
would overwrite the pointer for CREATE, and eventually discard/ignore it
for DELETE.

For x86, copy the arch data only for FLAGS_ONLY changes.  Unlike PPC HV,
x86 needs to reallocate arch data in the MOVE case as the size of x86's
allocations depend on the alignment of the memslot's gfn.

Opportunistically tweak kvm_arch_prepare_memory_region()'s param order to
match the "commit" prototype.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
[mss: add missing RISCV kvm_arch_prepare_memory_region() change]
Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
Message-Id: <67dea5f11bbcfd71e3da5986f11e87f5dd4013f9.1638817639.git.maciej.szmigiero@oracle.com>
---
 include/linux/kvm_host.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index e38705359af5..cb7311dc6f32 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -833,8 +833,9 @@ int __kvm_set_memory_region(struct kvm *kvm,
 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot);
 void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen);
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
-				struct kvm_memory_slot *memslot,
 				const struct kvm_userspace_memory_region *mem,
+				const struct kvm_memory_slot *old,
+				struct kvm_memory_slot *new,
 				enum kvm_mr_change change);
 void kvm_arch_commit_memory_region(struct kvm *kvm,
 				const struct kvm_userspace_memory_region *mem,
-- 
cgit v1.2.3


From 6a99c6e3f52a6f0d4c6ebcfa7359c718a19ffbe6 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Mon, 6 Dec 2021 20:54:18 +0100
Subject: KVM: Stop passing kvm_userspace_memory_region to arch memslot hooks

Drop the @mem param from kvm_arch_{prepare,commit}_memory_region() now
that its use has been removed in all architectures.

No functional change intended.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
Message-Id: <aa5ed3e62c27e881d0d8bc0acbc1572bc336dc19.1638817640.git.maciej.szmigiero@oracle.com>
---
 include/linux/kvm_host.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index cb7311dc6f32..da0d4f21a150 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -833,12 +833,10 @@ int __kvm_set_memory_region(struct kvm *kvm,
 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot);
 void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen);
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
-				const struct kvm_userspace_memory_region *mem,
 				const struct kvm_memory_slot *old,
 				struct kvm_memory_slot *new,
 				enum kvm_mr_change change);
 void kvm_arch_commit_memory_region(struct kvm *kvm,
-				const struct kvm_userspace_memory_region *mem,
 				struct kvm_memory_slot *old,
 				const struct kvm_memory_slot *new,
 				enum kvm_mr_change change);
-- 
cgit v1.2.3


From c928bfc2632fa3dd6a3bd4504ac6d8e42302287a Mon Sep 17 00:00:00 2001
From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
Date: Mon, 6 Dec 2021 20:54:25 +0100
Subject: KVM: Integrate gfn_to_memslot_approx() into search_memslots()

s390 arch has gfn_to_memslot_approx() which is almost identical to
search_memslots(), differing only in that in case the gfn falls in a hole
one of the memslots bordering the hole is returned.

Add this lookup mode as an option to search_memslots() so we don't have two
almost identical functions for looking up a memslot by its gfn.

Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
[sean: tweaked helper names to keep gfn_to_memslot_approx() in s390]
Reviewed-by: Sean Christopherson <seanjc@google.com>
Message-Id: <171cd89b52c718dbe180ecd909b4437a64a7e2ec.1638817640.git.maciej.szmigiero@oracle.com>
---
 include/linux/kvm_host.h | 35 +++++++++++++++++++++++++++--------
 1 file changed, 27 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index da0d4f21a150..2f80ce84fbcf 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1233,10 +1233,14 @@ try_get_memslot(struct kvm_memslots *slots, int slot_index, gfn_t gfn)
  * Returns a pointer to the memslot that contains gfn and records the index of
  * the slot in index. Otherwise returns NULL.
  *
+ * With "approx" set returns the memslot also when the address falls
+ * in a hole. In that case one of the memslots bordering the hole is
+ * returned.
+ *
  * IMPORTANT: Slots are sorted from highest GFN to lowest GFN!
  */
 static inline struct kvm_memory_slot *
-search_memslots(struct kvm_memslots *slots, gfn_t gfn, int *index)
+search_memslots(struct kvm_memslots *slots, gfn_t gfn, int *index, bool approx)
 {
 	int start = 0, end = slots->used_slots;
 	struct kvm_memory_slot *memslots = slots->memslots;
@@ -1254,22 +1258,26 @@ search_memslots(struct kvm_memslots *slots, gfn_t gfn, int *index)
 			start = slot + 1;
 	}
 
+	if (approx && start >= slots->used_slots) {
+		*index = slots->used_slots - 1;
+		return &memslots[slots->used_slots - 1];
+	}
+
 	slot = try_get_memslot(slots, start, gfn);
 	if (slot) {
 		*index = start;
 		return slot;
 	}
+	if (approx) {
+		*index = start;
+		return &memslots[start];
+	}
 
 	return NULL;
 }
 
-/*
- * __gfn_to_memslot() and its descendants are here because it is called from
- * non-modular code in arch/powerpc/kvm/book3s_64_vio{,_hv}.c. gfn_to_memslot()
- * itself isn't here as an inline because that would bloat other code too much.
- */
 static inline struct kvm_memory_slot *
-__gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn)
+____gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn, bool approx)
 {
 	struct kvm_memory_slot *slot;
 	int slot_index = atomic_read(&slots->last_used_slot);
@@ -1278,7 +1286,7 @@ __gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn)
 	if (slot)
 		return slot;
 
-	slot = search_memslots(slots, gfn, &slot_index);
+	slot = search_memslots(slots, gfn, &slot_index, approx);
 	if (slot) {
 		atomic_set(&slots->last_used_slot, slot_index);
 		return slot;
@@ -1287,6 +1295,17 @@ __gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn)
 	return NULL;
 }
 
+/*
+ * __gfn_to_memslot() and its descendants are here to allow arch code to inline
+ * the lookups in hot paths.  gfn_to_memslot() itself isn't here as an inline
+ * because that would bloat other code too much.
+ */
+static inline struct kvm_memory_slot *
+__gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn)
+{
+	return ____gfn_to_memslot(slots, gfn, false);
+}
+
 static inline unsigned long
 __gfn_to_hva_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
 {
-- 
cgit v1.2.3


From 26b8345abc75a7404716864710930407b7d873f9 Mon Sep 17 00:00:00 2001
From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
Date: Mon, 6 Dec 2021 20:54:27 +0100
Subject: KVM: Resolve memslot ID via a hash table instead of via a static
 array

Memslot ID to the corresponding memslot mappings are currently kept as
indices in static id_to_index array.
The size of this array depends on the maximum allowed memslot count
(regardless of the number of memslots actually in use).

This has become especially problematic recently, when memslot count cap was
removed, so the maximum count is now full 32k memslots - the maximum
allowed by the current KVM API.

Keeping these IDs in a hash table (instead of an array) avoids this
problem.

Resolving a memslot ID to the actual memslot (instead of its index) will
also enable transitioning away from an array-based implementation of the
whole memslots structure in a later commit.

Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
Message-Id: <117fb2c04320e6cd6cf34f205a72eadb0aa8d5f9.1638817640.git.maciej.szmigiero@oracle.com>
---
 include/linux/kvm_host.h | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 2f80ce84fbcf..79db70a8323e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -29,6 +29,7 @@
 #include <linux/refcount.h>
 #include <linux/nospec.h>
 #include <linux/notifier.h>
+#include <linux/hashtable.h>
 #include <linux/xarray.h>
 #include <asm/signal.h>
 
@@ -426,6 +427,7 @@ static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
 #define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1)
 
 struct kvm_memory_slot {
+	struct hlist_node id_node;
 	gfn_t base_gfn;
 	unsigned long npages;
 	unsigned long *dirty_bitmap;
@@ -527,8 +529,15 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
  */
 struct kvm_memslots {
 	u64 generation;
-	/* The mapping table from slot id to the index in memslots[]. */
-	short id_to_index[KVM_MEM_SLOTS_NUM];
+	/*
+	 * The mapping table from slot id to the index in memslots[].
+	 *
+	 * 7-bit bucket count matches the size of the old id to index array for
+	 * 512 slots, while giving good performance with this slot count.
+	 * Higher bucket counts bring only small performance improvements but
+	 * always result in higher memory usage (even for lower memslot counts).
+	 */
+	DECLARE_HASHTABLE(id_hash, 7);
 	atomic_t last_used_slot;
 	int used_slots;
 	struct kvm_memory_slot memslots[];
@@ -796,16 +805,14 @@ static inline struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu)
 static inline
 struct kvm_memory_slot *id_to_memslot(struct kvm_memslots *slots, int id)
 {
-	int index = slots->id_to_index[id];
 	struct kvm_memory_slot *slot;
 
-	if (index < 0)
-		return NULL;
-
-	slot = &slots->memslots[index];
+	hash_for_each_possible(slots->id_hash, slot, id_node, id) {
+		if (slot->id == id)
+			return slot;
+	}
 
-	WARN_ON(slot->id != id);
-	return slot;
+	return NULL;
 }
 
 /*
-- 
cgit v1.2.3


From ed922739c9199bf515a3e7fec3e319ce1edeef2a Mon Sep 17 00:00:00 2001
From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
Date: Mon, 6 Dec 2021 20:54:28 +0100
Subject: KVM: Use interval tree to do fast hva lookup in memslots

The current memslots implementation only allows quick binary search by gfn,
quick lookup by hva is not possible - the implementation has to do a linear
scan of the whole memslots array, even though the operation being performed
might apply just to a single memslot.

This significantly hurts performance of per-hva operations with higher
memslot counts.

Since hva ranges can overlap between memslots an interval tree is needed
for tracking them.

[sean: handle interval tree updates in kvm_replace_memslot()]
Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
Message-Id: <d66b9974becaa9839be9c4e1a5de97b177b4ac20.1638817640.git.maciej.szmigiero@oracle.com>
---
 include/linux/kvm_host.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 79db70a8323e..9552ad6d6652 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -30,6 +30,7 @@
 #include <linux/nospec.h>
 #include <linux/notifier.h>
 #include <linux/hashtable.h>
+#include <linux/interval_tree.h>
 #include <linux/xarray.h>
 #include <asm/signal.h>
 
@@ -428,6 +429,7 @@ static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
 
 struct kvm_memory_slot {
 	struct hlist_node id_node;
+	struct interval_tree_node hva_node;
 	gfn_t base_gfn;
 	unsigned long npages;
 	unsigned long *dirty_bitmap;
@@ -529,6 +531,7 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
  */
 struct kvm_memslots {
 	u64 generation;
+	struct rb_root_cached hva_tree;
 	/*
 	 * The mapping table from slot id to the index in memslots[].
 	 *
-- 
cgit v1.2.3


From a54d806688fe1e482350ce759a8a0fc9ebf814b0 Mon Sep 17 00:00:00 2001
From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
Date: Mon, 6 Dec 2021 20:54:30 +0100
Subject: KVM: Keep memslots in tree-based structures instead of array-based
 ones

The current memslot code uses a (reverse gfn-ordered) memslot array for
keeping track of them.

Because the memslot array that is currently in use cannot be modified
every memslot management operation (create, delete, move, change flags)
has to make a copy of the whole array so it has a scratch copy to work on.

Strictly speaking, however, it is only necessary to make copy of the
memslot that is being modified, copying all the memslots currently present
is just a limitation of the array-based memslot implementation.

Two memslot sets, however, are still needed so the VM continues to run
on the currently active set while the requested operation is being
performed on the second, currently inactive one.

In order to have two memslot sets, but only one copy of actual memslots
it is necessary to split out the memslot data from the memslot sets.

The memslots themselves should be also kept independent of each other
so they can be individually added or deleted.

These two memslot sets should normally point to the same set of
memslots. They can, however, be desynchronized when performing a
memslot management operation by replacing the memslot to be modified
by its copy.  After the operation is complete, both memslot sets once
again point to the same, common set of memslot data.

This commit implements the aforementioned idea.

For tracking of gfns an ordinary rbtree is used since memslots cannot
overlap in the guest address space and so this data structure is
sufficient for ensuring that lookups are done quickly.

The "last used slot" mini-caches (both per-slot set one and per-vCPU one),
that keep track of the last found-by-gfn memslot, are still present in the
new code.

Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
Message-Id: <17c0cf3663b760a0d3753d4ac08c0753e941b811.1638817641.git.maciej.szmigiero@oracle.com>
---
 include/linux/kvm_host.h | 143 +++++++++++++++++++++++------------------------
 1 file changed, 71 insertions(+), 72 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 9552ad6d6652..9eda8a63feae 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -31,6 +31,7 @@
 #include <linux/notifier.h>
 #include <linux/hashtable.h>
 #include <linux/interval_tree.h>
+#include <linux/rbtree.h>
 #include <linux/xarray.h>
 #include <asm/signal.h>
 
@@ -358,11 +359,13 @@ struct kvm_vcpu {
 	struct kvm_dirty_ring dirty_ring;
 
 	/*
-	 * The index of the most recently used memslot by this vCPU. It's ok
-	 * if this becomes stale due to memslot changes since we always check
-	 * it is a valid slot.
+	 * The most recently used memslot by this vCPU and the slots generation
+	 * for which it is valid.
+	 * No wraparound protection is needed since generations won't overflow in
+	 * thousands of years, even assuming 1M memslot operations per second.
 	 */
-	int last_used_slot;
+	struct kvm_memory_slot *last_used_slot;
+	u64 last_used_slot_gen;
 };
 
 /* must be called with irqs disabled */
@@ -427,9 +430,26 @@ static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
  */
 #define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1)
 
+/*
+ * Since at idle each memslot belongs to two memslot sets it has to contain
+ * two embedded nodes for each data structure that it forms a part of.
+ *
+ * Two memslot sets (one active and one inactive) are necessary so the VM
+ * continues to run on one memslot set while the other is being modified.
+ *
+ * These two memslot sets normally point to the same set of memslots.
+ * They can, however, be desynchronized when performing a memslot management
+ * operation by replacing the memslot to be modified by its copy.
+ * After the operation is complete, both memslot sets once again point to
+ * the same, common set of memslot data.
+ *
+ * The memslots themselves are independent of each other so they can be
+ * individually added or deleted.
+ */
 struct kvm_memory_slot {
-	struct hlist_node id_node;
-	struct interval_tree_node hva_node;
+	struct hlist_node id_node[2];
+	struct interval_tree_node hva_node[2];
+	struct rb_node gfn_node[2];
 	gfn_t base_gfn;
 	unsigned long npages;
 	unsigned long *dirty_bitmap;
@@ -524,16 +544,13 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
 }
 #endif
 
-/*
- * Note:
- * memslots are not sorted by id anymore, please use id_to_memslot()
- * to get the memslot by its id.
- */
 struct kvm_memslots {
 	u64 generation;
+	atomic_long_t last_used_slot;
 	struct rb_root_cached hva_tree;
+	struct rb_root gfn_tree;
 	/*
-	 * The mapping table from slot id to the index in memslots[].
+	 * The mapping table from slot id to memslot.
 	 *
 	 * 7-bit bucket count matches the size of the old id to index array for
 	 * 512 slots, while giving good performance with this slot count.
@@ -541,9 +558,7 @@ struct kvm_memslots {
 	 * always result in higher memory usage (even for lower memslot counts).
 	 */
 	DECLARE_HASHTABLE(id_hash, 7);
-	atomic_t last_used_slot;
-	int used_slots;
-	struct kvm_memory_slot memslots[];
+	int node_idx;
 };
 
 struct kvm {
@@ -565,6 +580,9 @@ struct kvm {
 	struct mutex slots_arch_lock;
 	struct mm_struct *mm; /* userspace tied to this vm */
 	unsigned long nr_memslot_pages;
+	/* The two memslot sets - active and inactive (per address space) */
+	struct kvm_memslots __memslots[KVM_ADDRESS_SPACE_NUM][2];
+	/* The current active memslot set for each address space */
 	struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM];
 	struct xarray vcpu_array;
 
@@ -739,11 +757,10 @@ static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id)
 	return NULL;
 }
 
-#define kvm_for_each_memslot(memslot, slots)				\
-	for (memslot = &slots->memslots[0];				\
-	     memslot < slots->memslots + slots->used_slots; memslot++)	\
-		if (WARN_ON_ONCE(!memslot->npages)) {			\
-		} else
+static inline int kvm_vcpu_get_idx(struct kvm_vcpu *vcpu)
+{
+	return vcpu->vcpu_idx;
+}
 
 void kvm_destroy_vcpus(struct kvm *kvm);
 
@@ -805,12 +822,23 @@ static inline struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu)
 	return __kvm_memslots(vcpu->kvm, as_id);
 }
 
+static inline bool kvm_memslots_empty(struct kvm_memslots *slots)
+{
+	return RB_EMPTY_ROOT(&slots->gfn_tree);
+}
+
+#define kvm_for_each_memslot(memslot, bkt, slots)			      \
+	hash_for_each(slots->id_hash, bkt, memslot, id_node[slots->node_idx]) \
+		if (WARN_ON_ONCE(!memslot->npages)) {			      \
+		} else
+
 static inline
 struct kvm_memory_slot *id_to_memslot(struct kvm_memslots *slots, int id)
 {
 	struct kvm_memory_slot *slot;
+	int idx = slots->node_idx;
 
-	hash_for_each_possible(slots->id_hash, slot, id_node, id) {
+	hash_for_each_possible(slots->id_hash, slot, id_node[idx], id) {
 		if (slot->id == id)
 			return slot;
 	}
@@ -1214,25 +1242,15 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
 bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args);
 
 /*
- * Returns a pointer to the memslot at slot_index if it contains gfn.
+ * Returns a pointer to the memslot if it contains gfn.
  * Otherwise returns NULL.
  */
 static inline struct kvm_memory_slot *
-try_get_memslot(struct kvm_memslots *slots, int slot_index, gfn_t gfn)
+try_get_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
 {
-	struct kvm_memory_slot *slot;
-
-	if (slot_index < 0 || slot_index >= slots->used_slots)
+	if (!slot)
 		return NULL;
 
-	/*
-	 * slot_index can come from vcpu->last_used_slot which is not kept
-	 * in sync with userspace-controllable memslot deletion. So use nospec
-	 * to prevent the CPU from speculating past the end of memslots[].
-	 */
-	slot_index = array_index_nospec(slot_index, slots->used_slots);
-	slot = &slots->memslots[slot_index];
-
 	if (gfn >= slot->base_gfn && gfn < slot->base_gfn + slot->npages)
 		return slot;
 	else
@@ -1240,65 +1258,46 @@ try_get_memslot(struct kvm_memslots *slots, int slot_index, gfn_t gfn)
 }
 
 /*
- * Returns a pointer to the memslot that contains gfn and records the index of
- * the slot in index. Otherwise returns NULL.
+ * Returns a pointer to the memslot that contains gfn. Otherwise returns NULL.
  *
  * With "approx" set returns the memslot also when the address falls
  * in a hole. In that case one of the memslots bordering the hole is
  * returned.
- *
- * IMPORTANT: Slots are sorted from highest GFN to lowest GFN!
  */
 static inline struct kvm_memory_slot *
-search_memslots(struct kvm_memslots *slots, gfn_t gfn, int *index, bool approx)
+search_memslots(struct kvm_memslots *slots, gfn_t gfn, bool approx)
 {
-	int start = 0, end = slots->used_slots;
-	struct kvm_memory_slot *memslots = slots->memslots;
 	struct kvm_memory_slot *slot;
-
-	if (unlikely(!slots->used_slots))
-		return NULL;
-
-	while (start < end) {
-		int slot = start + (end - start) / 2;
-
-		if (gfn >= memslots[slot].base_gfn)
-			end = slot;
-		else
-			start = slot + 1;
-	}
-
-	if (approx && start >= slots->used_slots) {
-		*index = slots->used_slots - 1;
-		return &memslots[slots->used_slots - 1];
-	}
-
-	slot = try_get_memslot(slots, start, gfn);
-	if (slot) {
-		*index = start;
-		return slot;
-	}
-	if (approx) {
-		*index = start;
-		return &memslots[start];
+	struct rb_node *node;
+	int idx = slots->node_idx;
+
+	slot = NULL;
+	for (node = slots->gfn_tree.rb_node; node; ) {
+		slot = container_of(node, struct kvm_memory_slot, gfn_node[idx]);
+		if (gfn >= slot->base_gfn) {
+			if (gfn < slot->base_gfn + slot->npages)
+				return slot;
+			node = node->rb_right;
+		} else
+			node = node->rb_left;
 	}
 
-	return NULL;
+	return approx ? slot : NULL;
 }
 
 static inline struct kvm_memory_slot *
 ____gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn, bool approx)
 {
 	struct kvm_memory_slot *slot;
-	int slot_index = atomic_read(&slots->last_used_slot);
 
-	slot = try_get_memslot(slots, slot_index, gfn);
+	slot = (struct kvm_memory_slot *)atomic_long_read(&slots->last_used_slot);
+	slot = try_get_memslot(slot, gfn);
 	if (slot)
 		return slot;
 
-	slot = search_memslots(slots, gfn, &slot_index, approx);
+	slot = search_memslots(slots, gfn, approx);
 	if (slot) {
-		atomic_set(&slots->last_used_slot, slot_index);
+		atomic_long_set(&slots->last_used_slot, (unsigned long)slot);
 		return slot;
 	}
 
-- 
cgit v1.2.3


From f4209439b522432d140d33393d4a3f12e695527b Mon Sep 17 00:00:00 2001
From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
Date: Mon, 6 Dec 2021 20:54:32 +0100
Subject: KVM: Optimize gfn lookup in kvm_zap_gfn_range()

Introduce a memslots gfn upper bound operation and use it to optimize
kvm_zap_gfn_range().
This way this handler can do a quick lookup for intersecting gfns and won't
have to do a linear scan of the whole memslot set.

Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
Message-Id: <ef242146a87a335ee93b441dcf01665cb847c902.1638817641.git.maciej.szmigiero@oracle.com>
---
 include/linux/kvm_host.h | 94 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 9eda8a63feae..3bc98497e796 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -846,6 +846,100 @@ struct kvm_memory_slot *id_to_memslot(struct kvm_memslots *slots, int id)
 	return NULL;
 }
 
+/* Iterator used for walking memslots that overlap a gfn range. */
+struct kvm_memslot_iter {
+	struct kvm_memslots *slots;
+	struct rb_node *node;
+	struct kvm_memory_slot *slot;
+};
+
+static inline void kvm_memslot_iter_next(struct kvm_memslot_iter *iter)
+{
+	iter->node = rb_next(iter->node);
+	if (!iter->node)
+		return;
+
+	iter->slot = container_of(iter->node, struct kvm_memory_slot, gfn_node[iter->slots->node_idx]);
+}
+
+static inline void kvm_memslot_iter_start(struct kvm_memslot_iter *iter,
+					  struct kvm_memslots *slots,
+					  gfn_t start)
+{
+	int idx = slots->node_idx;
+	struct rb_node *tmp;
+	struct kvm_memory_slot *slot;
+
+	iter->slots = slots;
+
+	/*
+	 * Find the so called "upper bound" of a key - the first node that has
+	 * its key strictly greater than the searched one (the start gfn in our case).
+	 */
+	iter->node = NULL;
+	for (tmp = slots->gfn_tree.rb_node; tmp; ) {
+		slot = container_of(tmp, struct kvm_memory_slot, gfn_node[idx]);
+		if (start < slot->base_gfn) {
+			iter->node = tmp;
+			tmp = tmp->rb_left;
+		} else {
+			tmp = tmp->rb_right;
+		}
+	}
+
+	/*
+	 * Find the slot with the lowest gfn that can possibly intersect with
+	 * the range, so we'll ideally have slot start <= range start
+	 */
+	if (iter->node) {
+		/*
+		 * A NULL previous node means that the very first slot
+		 * already has a higher start gfn.
+		 * In this case slot start > range start.
+		 */
+		tmp = rb_prev(iter->node);
+		if (tmp)
+			iter->node = tmp;
+	} else {
+		/* a NULL node below means no slots */
+		iter->node = rb_last(&slots->gfn_tree);
+	}
+
+	if (iter->node) {
+		iter->slot = container_of(iter->node, struct kvm_memory_slot, gfn_node[idx]);
+
+		/*
+		 * It is possible in the slot start < range start case that the
+		 * found slot ends before or at range start (slot end <= range start)
+		 * and so it does not overlap the requested range.
+		 *
+		 * In such non-overlapping case the next slot (if it exists) will
+		 * already have slot start > range start, otherwise the logic above
+		 * would have found it instead of the current slot.
+		 */
+		if (iter->slot->base_gfn + iter->slot->npages <= start)
+			kvm_memslot_iter_next(iter);
+	}
+}
+
+static inline bool kvm_memslot_iter_is_valid(struct kvm_memslot_iter *iter, gfn_t end)
+{
+	if (!iter->node)
+		return false;
+
+	/*
+	 * If this slot starts beyond or at the end of the range so does
+	 * every next one
+	 */
+	return iter->slot->base_gfn < end;
+}
+
+/* Iterate over each memslot at least partially intersecting [start, end) range */
+#define kvm_for_each_memslot_in_gfn_range(iter, slots, start, end)	\
+	for (kvm_memslot_iter_start(iter, slots, start);		\
+	     kvm_memslot_iter_is_valid(iter, end);			\
+	     kvm_memslot_iter_next(iter))
+
 /*
  * KVM_SET_USER_MEMORY_REGION ioctl allows the following operations:
  * - create a new memory slot
-- 
cgit v1.2.3


From 8283e36abfff507c64fe8289ac30ea7ab59648aa Mon Sep 17 00:00:00 2001
From: Ben Gardon <bgardon@google.com>
Date: Mon, 15 Nov 2021 15:45:58 -0800
Subject: KVM: x86/mmu: Propagate memslot const qualifier

In preparation for implementing in-place hugepage promotion, various
functions will need to be called from zap_collapsible_spte_range, which
has the const qualifier on its memslot argument. Propagate the const
qualifier to the various functions which will be needed. This just serves
to simplify the following patch.

No functional change intended.

Signed-off-by: Ben Gardon <bgardon@google.com>
Message-Id: <20211115234603.2908381-11-bgardon@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3bc98497e796..3eb7695aaa73 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -460,7 +460,7 @@ struct kvm_memory_slot {
 	u16 as_id;
 };
 
-static inline bool kvm_slot_dirty_track_enabled(struct kvm_memory_slot *slot)
+static inline bool kvm_slot_dirty_track_enabled(const struct kvm_memory_slot *slot)
 {
 	return slot->flags & KVM_MEM_LOG_DIRTY_PAGES;
 }
@@ -994,9 +994,9 @@ void kvm_set_page_accessed(struct page *page);
 kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
 kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
 		      bool *writable);
-kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
-kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn);
-kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
+kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn);
+kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn);
+kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
 			       bool atomic, bool *async, bool write_fault,
 			       bool *writable, hva_t *hva);
 
@@ -1073,7 +1073,7 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
 bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
 bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
 unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn);
-void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot, gfn_t gfn);
+void mark_page_dirty_in_slot(struct kvm *kvm, const struct kvm_memory_slot *memslot, gfn_t gfn);
 void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
 
 struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu);
-- 
cgit v1.2.3


From 510958e997217e39a16b47afb5a44dfa39013964 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 8 Oct 2021 19:11:57 -0700
Subject: KVM: Force PPC to define its own rcuwait object

Do not define/reference kvm_vcpu.wait if __KVM_HAVE_ARCH_WQP is true, and
instead force the architecture (PPC) to define its own rcuwait object.
Allowing common KVM to directly access vcpu->wait without a guard makes
it all too easy to introduce potential bugs, e.g. kvm_vcpu_block(),
kvm_vcpu_on_spin(), and async_pf_execute() all operate on vcpu->wait, not
the result of kvm_arch_vcpu_get_wait(), and so may do the wrong thing for
PPC.

Due to PPC's shenanigans with respect to callbacks and waits (it switches
to the virtual core's wait object at KVM_RUN!?!?), it's not clear whether
or not this fixes any bugs.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20211009021236.4122790-5-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3eb7695aaa73..afacbfb2e482 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -314,7 +314,9 @@ struct kvm_vcpu {
 	struct mutex mutex;
 	struct kvm_run *run;
 
+#ifndef __KVM_HAVE_ARCH_WQP
 	struct rcuwait wait;
+#endif
 	struct pid __rcu *pid;
 	int sigset_active;
 	sigset_t sigset;
-- 
cgit v1.2.3


From 91b99ea7065786d0bff1c9281b002455dbaeb08b Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 8 Oct 2021 19:12:06 -0700
Subject: KVM: Rename kvm_vcpu_block() => kvm_vcpu_halt()

Rename kvm_vcpu_block() to kvm_vcpu_halt() in preparation for splitting
the actual "block" sequences into a separate helper (to be named
kvm_vcpu_block()).  x86 will use the standalone block-only path to handle
non-halt cases where the vCPU is not runnable.

Rename block_ns to halt_ns to match the new function name.

No functional change intended.

Reviewed-by: David Matlack <dmatlack@google.com>
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20211009021236.4122790-14-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index afacbfb2e482..ea3c22d55d56 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1102,7 +1102,7 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn);
 void kvm_sigset_activate(struct kvm_vcpu *vcpu);
 void kvm_sigset_deactivate(struct kvm_vcpu *vcpu);
 
-void kvm_vcpu_block(struct kvm_vcpu *vcpu);
+void kvm_vcpu_halt(struct kvm_vcpu *vcpu);
 void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu);
 void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu);
 bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu);
-- 
cgit v1.2.3


From fac4268894394213127e43856f41d10f29131e69 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 8 Oct 2021 19:12:07 -0700
Subject: KVM: Split out a kvm_vcpu_block() helper from kvm_vcpu_halt()

Factor out the "block" part of kvm_vcpu_halt() so that x86 can emulate
non-halt wait/sleep/block conditions that should not be subjected to
halt-polling.

No functional change intended.

Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: David Matlack <dmatlack@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20211009021236.4122790-15-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ea3c22d55d56..bd13c5b5bd1d 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1103,6 +1103,7 @@ void kvm_sigset_activate(struct kvm_vcpu *vcpu);
 void kvm_sigset_deactivate(struct kvm_vcpu *vcpu);
 
 void kvm_vcpu_halt(struct kvm_vcpu *vcpu);
+bool kvm_vcpu_block(struct kvm_vcpu *vcpu);
 void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu);
 void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu);
 bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu);
-- 
cgit v1.2.3


From c3858335c711569b82a234a560dc19247e8f3fcc Mon Sep 17 00:00:00 2001
From: Jing Zhang <jingzhangos@google.com>
Date: Fri, 8 Oct 2021 19:12:08 -0700
Subject: KVM: stats: Add stat to detect if vcpu is currently blocking

Add a "blocking" stat that userspace can use to detect the case where a
vCPU is not being run because of an vCPU/guest action, e.g. HLT or WFS on
x86, WFI on arm64, etc...  Current guest/host/halt stats don't show this
well, e.g. if a guest halts for a long period of time then the vCPU could
could appear pathologically blocked due to a host condition, when in
reality the vCPU has been put into a not-runnable state by the guest.

Originally-by: Cannon Matthews <cannonmatthews@google.com>
Suggested-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: David Matlack <dmatlack@google.com>
Signed-off-by: Jing Zhang <jingzhangos@google.com>
[sean: renamed stat to "blocking", massaged changelog]
Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20211009021236.4122790-16-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h  | 3 ++-
 include/linux/kvm_types.h | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index bd13c5b5bd1d..dc7740cafea7 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1587,7 +1587,8 @@ struct _kvm_stats_desc {
 	STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_hist,	       \
 			HALT_POLL_HIST_COUNT),				       \
 	STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_wait_hist,	       \
-			HALT_POLL_HIST_COUNT)
+			HALT_POLL_HIST_COUNT),				       \
+	STATS_DESC_ICOUNTER(VCPU_GENERIC, blocking)
 
 extern struct dentry *kvm_debugfs_dir;
 
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 234eab059839..888ef12862c9 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -87,6 +87,7 @@ struct kvm_vcpu_stat_generic {
 	u64 halt_poll_success_hist[HALT_POLL_HIST_COUNT];
 	u64 halt_poll_fail_hist[HALT_POLL_HIST_COUNT];
 	u64 halt_wait_hist[HALT_POLL_HIST_COUNT];
+	u64 blocking;
 };
 
 #define KVM_STATS_NAME_SIZE	48
-- 
cgit v1.2.3


From d92a5d1c6c757f659ffb9c2c2e65fcf3d571c14e Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 8 Oct 2021 19:12:12 -0700
Subject: KVM: Add helpers to wake/query blocking vCPU

Add helpers to wake and query a blocking vCPU.  In addition to providing
nice names, the helpers reduce the probability of KVM neglecting to use
kvm_arch_vcpu_get_wait().

No functional change intended.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20211009021236.4122790-20-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index dc7740cafea7..f8ed799e8674 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1286,6 +1286,20 @@ static inline struct rcuwait *kvm_arch_vcpu_get_wait(struct kvm_vcpu *vcpu)
 #endif
 }
 
+/*
+ * Wake a vCPU if necessary, but don't do any stats/metadata updates.  Returns
+ * true if the vCPU was blocking and was awakened, false otherwise.
+ */
+static inline bool __kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
+{
+	return !!rcuwait_wake_up(kvm_arch_vcpu_get_wait(vcpu));
+}
+
+static inline bool kvm_vcpu_is_blocking(struct kvm_vcpu *vcpu)
+{
+	return rcuwait_active(kvm_arch_vcpu_get_wait(vcpu));
+}
+
 #ifdef __KVM_HAVE_ARCH_INTC_INITIALIZED
 /*
  * returns true if the virtual interrupt controller is initialized and
-- 
cgit v1.2.3


From 815f0e738a8d5663a02350e2580706829144a722 Mon Sep 17 00:00:00 2001
From: Horatiu Vultur <horatiu.vultur@microchip.com>
Date: Wed, 3 Nov 2021 09:50:59 +0100
Subject: clk: gate: Add devm_clk_hw_register_gate()

Add devm_clk_hw_register_gate() - devres-managed version of
clk_hw_register_gate()

Suggested-by: Stephen Boyd <sboyd@kernel.org>
Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
Acked-by: Nicolas Ferre <nicolas.ferre@microchip.com>
Signed-off-by: Nicolas Ferre <nicolas.ferre@microchip.com>
Link: https://lore.kernel.org/r/20211103085102.1656081-2-horatiu.vultur@microchip.com
---
 include/linux/clk-provider.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index f59c875271a0..2faa6f7aa8a8 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -490,6 +490,13 @@ struct clk_hw *__clk_hw_register_gate(struct device *dev,
 		unsigned long flags,
 		void __iomem *reg, u8 bit_idx,
 		u8 clk_gate_flags, spinlock_t *lock);
+struct clk_hw *__devm_clk_hw_register_gate(struct device *dev,
+		struct device_node *np, const char *name,
+		const char *parent_name, const struct clk_hw *parent_hw,
+		const struct clk_parent_data *parent_data,
+		unsigned long flags,
+		void __iomem *reg, u8 bit_idx,
+		u8 clk_gate_flags, spinlock_t *lock);
 struct clk *clk_register_gate(struct device *dev, const char *name,
 		const char *parent_name, unsigned long flags,
 		void __iomem *reg, u8 bit_idx,
@@ -544,6 +551,22 @@ struct clk *clk_register_gate(struct device *dev, const char *name,
 	__clk_hw_register_gate((dev), NULL, (name), NULL, NULL, (parent_data), \
 			       (flags), (reg), (bit_idx),		      \
 			       (clk_gate_flags), (lock))
+/**
+ * devm_clk_hw_register_gate - register a gate clock with the clock framework
+ * @dev: device that is registering this clock
+ * @name: name of this clock
+ * @parent_name: name of this clock's parent
+ * @flags: framework-specific flags for this clock
+ * @reg: register address to control gating of this clock
+ * @bit_idx: which bit in the register controls gating of this clock
+ * @clk_gate_flags: gate-specific flags for this clock
+ * @lock: shared register lock for this clock
+ */
+#define devm_clk_hw_register_gate(dev, name, parent_name, flags, reg, bit_idx,\
+				  clk_gate_flags, lock)			      \
+	__devm_clk_hw_register_gate((dev), NULL, (name), (parent_name), NULL, \
+			       NULL, (flags), (reg), (bit_idx),		      \
+			       (clk_gate_flags), (lock))
 void clk_unregister_gate(struct clk *clk);
 void clk_hw_unregister_gate(struct clk_hw *hw);
 int clk_gate_is_enabled(struct clk_hw *hw);
-- 
cgit v1.2.3


From 444dd878e85fb33fcfb2682cfdab4c236f33ea3e Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Fri, 3 Dec 2021 17:19:47 +0100
Subject: PM: runtime: Fix pm_runtime_active() kerneldoc comment

The kerneldoc comment of pm_runtime_active() does not reflect the
behavior of the function, so update it accordingly.

Fixes: 403d2d116ec0 ("PM: runtime: Add kerneldoc comments to multiple helpers")
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 include/linux/pm_runtime.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index 222da43b7096..eddd66d426ca 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -129,7 +129,7 @@ static inline bool pm_runtime_suspended(struct device *dev)
  * pm_runtime_active - Check whether or not a device is runtime-active.
  * @dev: Target device.
  *
- * Return %true if runtime PM is enabled for @dev and its runtime PM status is
+ * Return %true if runtime PM is disabled for @dev or its runtime PM status is
  * %RPM_ACTIVE, or %false otherwise.
  *
  * Note that the return value of this function can only be trusted if it is
-- 
cgit v1.2.3


From 74d9555580c48a04b2c3b742dfb0c80777aa0b26 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Mon, 8 Nov 2021 16:09:41 +0000
Subject: PM: hibernate: Allow ACPI hardware signature to be honoured

Theoretically, when the hardware signature in FACS changes, the OS
is supposed to gracefully decline to attempt to resume from S4:

 "If the signature has changed, OSPM will not restore the system
  context and can boot from scratch"

In practice, Windows doesn't do this and many laptop vendors do allow
the signature to change especially when docking/undocking, so it would
be a bad idea to simply comply with the specification by default in the
general case.

However, there are use cases where we do want the compliant behaviour
and we know it's safe. Specifically, when resuming virtual machines where
we know the hypervisor has changed sufficiently that resume will fail.
We really want to be able to *tell* the guest kernel not to try, so it
boots cleanly and doesn't just crash. This patch provides a way to opt
in to the spec-compliant behaviour on the command line.

A follow-up patch may do this automatically for certain "known good"
machines based on a DMI match, or perhaps just for all hypervisor
guests since there's no good reason a hypervisor would change the
hardware_signature that it exposes to guests *unless* it wants them
to obey the ACPI specification.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/acpi.h    | 2 +-
 include/linux/suspend.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index b28f8790192a..6c0798db6bde 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -506,7 +506,7 @@ acpi_status acpi_release_memory(acpi_handle handle, struct resource *res,
 int acpi_resources_are_enforced(void);
 
 #ifdef CONFIG_HIBERNATION
-void __init acpi_no_s4_hw_signature(void);
+void __init acpi_check_s4_hw_signature(int check);
 #endif
 
 #ifdef CONFIG_PM_SLEEP
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 8af13ba60c7e..5785d909c321 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -446,6 +446,7 @@ extern unsigned long get_safe_page(gfp_t gfp_mask);
 extern asmlinkage int swsusp_arch_suspend(void);
 extern asmlinkage int swsusp_arch_resume(void);
 
+extern u32 swsusp_hardware_signature;
 extern void hibernation_set_ops(const struct platform_hibernation_ops *ops);
 extern int hibernate(void);
 extern bool system_entering_hibernation(void);
-- 
cgit v1.2.3


From 8260b9820f7050461b8969305bbd8cb5654f0c74 Mon Sep 17 00:00:00 2001
From: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Date: Mon, 6 Dec 2021 16:55:03 +0300
Subject: x86/sev: Use CC_ATTR attribute to generalize string I/O unroll

INS/OUTS are not supported in TDX guests and cause #UD. Kernel has to
avoid them when running in TDX guest. To support existing usage, string
I/O operations are unrolled using IN/OUT instructions.

AMD SEV platform implements this support by adding unroll
logic in ins#bwl()/outs#bwl() macros with SEV-specific checks.
Since TDX VM guests will also need similar support, use
CC_ATTR_GUEST_UNROLL_STRING_IO and generic cc_platform_has() API to
implement it.

String I/O helpers were the last users of sev_key_active() interface and
sev_enable_key static key. Remove them.

 [ bp: Move comment too and do not delete it. ]

Suggested-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com>
Tested-by: Tom Lendacky <thomas.lendacky@amd.com>
Link: https://lkml.kernel.org/r/20211206135505.75045-2-kirill.shutemov@linux.intel.com
---
 include/linux/cc_platform.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cc_platform.h b/include/linux/cc_platform.h
index a075b70b9a70..efd8205282da 100644
--- a/include/linux/cc_platform.h
+++ b/include/linux/cc_platform.h
@@ -61,6 +61,17 @@ enum cc_attr {
 	 * Examples include SEV-ES.
 	 */
 	CC_ATTR_GUEST_STATE_ENCRYPT,
+
+	/**
+	 * @CC_ATTR_GUEST_UNROLL_STRING_IO: String I/O is implemented with
+	 *                                  IN/OUT instructions
+	 *
+	 * The platform/OS is running as a guest/virtual machine and uses
+	 * IN/OUT instructions in place of string I/O.
+	 *
+	 * Examples include TDX guest & SEV.
+	 */
+	CC_ATTR_GUEST_UNROLL_STRING_IO,
 };
 
 #ifdef CONFIG_ARCH_HAS_CC_PLATFORM
-- 
cgit v1.2.3


From 3f9bb0301d50ce27421eff4b710c2bbe58111a83 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Mon, 6 Dec 2021 18:57:47 +0200
Subject: net: dsa: make dp->bridge_num one-based
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I have seen too many bugs already due to the fact that we must encode an
invalid dp->bridge_num as a negative value, because the natural tendency
is to check that invalid value using (!dp->bridge_num). Latest example
can be seen in commit 1bec0f05062c ("net: dsa: fix bridge_num not
getting cleared after ports leaving the bridge").

Convert the existing users to assume that dp->bridge_num == 0 is the
encoding for invalid, and valid bridge numbers start from 1.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Alvin Šipraga <alsi@bang-olufsen.dk>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/dsa/8021q.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h
index 254b165f2b44..0af4371fbebb 100644
--- a/include/linux/dsa/8021q.h
+++ b/include/linux/dsa/8021q.h
@@ -38,13 +38,13 @@ void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id);
 
 int dsa_tag_8021q_bridge_tx_fwd_offload(struct dsa_switch *ds, int port,
 					struct net_device *br,
-					int bridge_num);
+					unsigned int bridge_num);
 
 void dsa_tag_8021q_bridge_tx_fwd_unoffload(struct dsa_switch *ds, int port,
 					   struct net_device *br,
-					   int bridge_num);
+					   unsigned int bridge_num);
 
-u16 dsa_8021q_bridge_tx_fwd_offload_vid(int bridge_num);
+u16 dsa_8021q_bridge_tx_fwd_offload_vid(unsigned int bridge_num);
 
 u16 dsa_tag_8021q_tx_vid(const struct dsa_port *dp);
 
-- 
cgit v1.2.3


From d3eed0e57d5d1bcbf1bd60f83a4adfe7d7b8dd9c Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Mon, 6 Dec 2021 18:57:56 +0200
Subject: net: dsa: keep the bridge_dev and bridge_num as part of the same
 structure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The main desire behind this is to provide coherent bridge information to
the fast path without locking.

For example, right now we set dp->bridge_dev and dp->bridge_num from
separate code paths, it is theoretically possible for a packet
transmission to read these two port properties consecutively and find a
bridge number which does not correspond with the bridge device.

Another desire is to start passing more complex bridge information to
dsa_switch_ops functions. For example, with FDB isolation, it is
expected that drivers will need to be passed the bridge which requested
an FDB/MDB entry to be offloaded, and along with that bridge_dev, the
associated bridge_num should be passed too, in case the driver might
want to implement an isolation scheme based on that number.

We already pass the {bridge_dev, bridge_num} pair to the TX forwarding
offload switch API, however we'd like to remove that and squash it into
the basic bridge join/leave API. So that means we need to pass this
pair to the bridge join/leave API.

During dsa_port_bridge_leave, first we unset dp->bridge_dev, then we
call the driver's .port_bridge_leave with what used to be our
dp->bridge_dev, but provided as an argument.

When bridge_dev and bridge_num get folded into a single structure, we
need to preserve this behavior in dsa_port_bridge_leave: we need a copy
of what used to be in dp->bridge.

Switch drivers check bridge membership by comparing dp->bridge_dev with
the provided bridge_dev, but now, if we provide the struct dsa_bridge as
a pointer, they cannot keep comparing dp->bridge to the provided
pointer, since this only points to an on-stack copy. To make this
obvious and prevent driver writers from forgetting and doing stupid
things, in this new API, the struct dsa_bridge is provided as a full
structure (not very large, contains an int and a pointer) instead of a
pointer. An explicit comparison function needs to be used to determine
bridge membership: dsa_port_offloads_bridge().

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Alvin Šipraga <alsi@bang-olufsen.dk>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/dsa/8021q.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h
index 0af4371fbebb..939a1beaddf7 100644
--- a/include/linux/dsa/8021q.h
+++ b/include/linux/dsa/8021q.h
@@ -7,6 +7,7 @@
 
 #include <linux/refcount.h>
 #include <linux/types.h>
+#include <net/dsa.h>
 
 struct dsa_switch;
 struct dsa_port;
@@ -37,12 +38,10 @@ struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev,
 void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id);
 
 int dsa_tag_8021q_bridge_tx_fwd_offload(struct dsa_switch *ds, int port,
-					struct net_device *br,
-					unsigned int bridge_num);
+					struct dsa_bridge bridge);
 
 void dsa_tag_8021q_bridge_tx_fwd_unoffload(struct dsa_switch *ds, int port,
-					   struct net_device *br,
-					   unsigned int bridge_num);
+					   struct dsa_bridge bridge);
 
 u16 dsa_8021q_bridge_tx_fwd_offload_vid(unsigned int bridge_num);
 
-- 
cgit v1.2.3


From 283e6f5a8166f075eba78da6b867d76cc5d47e77 Mon Sep 17 00:00:00 2001
From: Sergey Ryazanov <ryazanov.s.a@gmail.com>
Date: Tue, 7 Dec 2021 12:21:40 +0300
Subject: net: wwan: make debugfs optional

Debugfs interface is optional for the regular modem use. Some distros
and users will want to disable this feature for security or kernel
size reasons. So add a configuration option that allows to completely
disable the debugfs interface of the WWAN devices.

A primary considered use case for this option was embedded firmwares.
For example, in OpenWrt, you can not completely disable debugfs, as a
lot of wireless stuff can only be configured and monitored with the
debugfs knobs. At the same time, reducing the size of a kernel and
modules is an essential task in the world of embedded software.
Disabling the WWAN and IOSM debugfs interfaces allows us to save 50K
(x86-64 build) of space for module storage. Not much, but already
considerable when you only have 16MB of storage.

So it is hard to just disable whole debugfs. Users need some fine
grained set of options to control which debugfs interface is important
and should be available and which is not.

The new configuration symbol is enabled by default and is hidden under
the EXPERT option. So a regular user would not be bothered by another
one configuration question. While an embedded distro maintainer will be
able to a little more reduce the final image size.

Signed-off-by: Sergey Ryazanov <ryazanov.s.a@gmail.com>
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Reviewed-by: Loic Poulain <loic.poulain@linaro.org>
Acked-by: M Chetan Kumar <m.chetan.kumar@intel.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/wwan.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/wwan.h b/include/linux/wwan.h
index 1646aa3e6779..e143c88bf4b0 100644
--- a/include/linux/wwan.h
+++ b/include/linux/wwan.h
@@ -171,6 +171,13 @@ int wwan_register_ops(struct device *parent, const struct wwan_ops *ops,
 
 void wwan_unregister_ops(struct device *parent);
 
+#ifdef CONFIG_WWAN_DEBUGFS
 struct dentry *wwan_get_debugfs_dir(struct device *parent);
+#else
+static inline struct dentry *wwan_get_debugfs_dir(struct device *parent)
+{
+	return ERR_PTR(-ENODEV);
+}
+#endif
 
 #endif /* __WWAN_H */
-- 
cgit v1.2.3


From 1197528aaea79ed4909aba695d18fdecc5387a36 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 6 Dec 2021 23:27:28 +0100
Subject: genirq/msi: Guard sysfs code

No point in building unused code when CONFIG_SYSFS=n.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/20211206210223.985907940@linutronix.de
---
 include/linux/msi.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index e616f94c7c58..d43b9469c88b 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -239,9 +239,19 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
 void pci_msi_mask_irq(struct irq_data *data);
 void pci_msi_unmask_irq(struct irq_data *data);
 
+#ifdef CONFIG_SYSFS
 const struct attribute_group **msi_populate_sysfs(struct device *dev);
 void msi_destroy_sysfs(struct device *dev,
 		       const struct attribute_group **msi_irq_groups);
+#else
+static inline const struct attribute_group **msi_populate_sysfs(struct device *dev)
+{
+	return NULL;
+}
+static inline void msi_destroy_sysfs(struct device *dev, const struct attribute_group **msi_irq_groups)
+{
+}
+#endif
 
 /*
  * The arch hooks to setup up msi irqs. Default functions are implemented
-- 
cgit v1.2.3


From 1dd2c6a0817fd08f80dee75d7d3bd99a0c4b828d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 6 Dec 2021 23:27:29 +0100
Subject: genirq/msi: Remove unused domain callbacks

No users and there is no need to grow them.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20211126223824.322987915@linutronix.de
Link: https://lore.kernel.org/r/20211206210224.041777889@linutronix.de
---
 include/linux/msi.h | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index d43b9469c88b..4b962f73f84a 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -304,7 +304,6 @@ struct msi_domain_info;
  * @msi_free:		Domain specific function to free a MSI interrupts
  * @msi_check:		Callback for verification of the domain/info/dev data
  * @msi_prepare:	Prepare the allocation of the interrupts in the domain
- * @msi_finish:		Optional callback to finalize the allocation
  * @set_desc:		Set the msi descriptor for an interrupt
  * @handle_error:	Optional error handler if the allocation fails
  * @domain_alloc_irqs:	Optional function to override the default allocation
@@ -312,12 +311,11 @@ struct msi_domain_info;
  * @domain_free_irqs:	Optional function to override the default free
  *			function.
  *
- * @get_hwirq, @msi_init and @msi_free are callbacks used by
- * msi_create_irq_domain() and related interfaces
+ * @get_hwirq, @msi_init and @msi_free are callbacks used by the underlying
+ * irqdomain.
  *
- * @msi_check, @msi_prepare, @msi_finish, @set_desc and @handle_error
- * are callbacks used by msi_domain_alloc_irqs() and related
- * interfaces which are based on msi_desc.
+ * @msi_check, @msi_prepare, @handle_error and @set_desc are callbacks used by
+ * msi_domain_alloc/free_irqs().
  *
  * @domain_alloc_irqs, @domain_free_irqs can be used to override the
  * default allocation/free functions (__msi_domain_alloc/free_irqs). This
@@ -351,7 +349,6 @@ struct msi_domain_ops {
 	int		(*msi_prepare)(struct irq_domain *domain,
 				       struct device *dev, int nvec,
 				       msi_alloc_info_t *arg);
-	void		(*msi_finish)(msi_alloc_info_t *arg, int retval);
 	void		(*set_desc)(msi_alloc_info_t *arg,
 				    struct msi_desc *desc);
 	int		(*handle_error)(struct irq_domain *domain,
-- 
cgit v1.2.3


From 3ba1f050c91d5ce3672dbf3a55dc2451c0b342e2 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 6 Dec 2021 23:27:31 +0100
Subject: genirq/msi: Fixup includes

Remove the kobject.h include from msi.h as it's not required and add a
sysfs.h include to the core code instead.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/20211206210224.103502021@linutronix.de
---
 include/linux/msi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index 4b962f73f84a..5c627750f269 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -2,7 +2,7 @@
 #ifndef LINUX_MSI_H
 #define LINUX_MSI_H
 
-#include <linux/kobject.h>
+#include <linux/cpumask.h>
 #include <linux/list.h>
 #include <asm/msi.h>
 
-- 
cgit v1.2.3


From 9e8688c5f2990dadcf83728cd00a7e8497fc6aa9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 6 Dec 2021 23:27:33 +0100
Subject: PCI/MSI: Make pci_msi_domain_write_msg() static

There is no point to have this function public as it is set by the PCI core
anyway when a PCI/MSI irqdomain is created.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>	# PCI
Link: https://lore.kernel.org/r/20211206210224.157070464@linutronix.de
---
 include/linux/msi.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index 5c627750f269..d7b143a79cb4 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -455,7 +455,6 @@ void *platform_msi_get_host_data(struct irq_domain *domain);
 #endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */
 
 #ifdef CONFIG_PCI_MSI_IRQ_DOMAIN
-void pci_msi_domain_write_msg(struct irq_data *irq_data, struct msi_msg *msg);
 struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode,
 					     struct msi_domain_info *info,
 					     struct irq_domain *parent);
-- 
cgit v1.2.3


From ade044a3d0f0389e4f916337c505550acc3fd011 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 6 Dec 2021 23:27:34 +0100
Subject: PCI/MSI: Remove msi_desc_to_pci_sysdata()

Last user is gone long ago.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://lore.kernel.org/r/20211206210224.210768199@linutronix.de
---
 include/linux/msi.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index d7b143a79cb4..ac6fec105edc 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -218,13 +218,8 @@ static inline void msi_desc_set_iommu_cookie(struct msi_desc *desc,
 	for_each_msi_entry((desc), &(pdev)->dev)
 
 struct pci_dev *msi_desc_to_pci_dev(struct msi_desc *desc);
-void *msi_desc_to_pci_sysdata(struct msi_desc *desc);
 void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg);
 #else /* CONFIG_PCI_MSI */
-static inline void *msi_desc_to_pci_sysdata(struct msi_desc *desc)
-{
-	return NULL;
-}
 static inline void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg)
 {
 }
-- 
cgit v1.2.3


From e58f2259b91c02974c20db7b28d39d810a21249b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 6 Dec 2021 23:27:39 +0100
Subject: genirq/msi, treewide: Use a named struct for PCI/MSI attributes

The unnamed struct sucks and is in the way of further cleanups. Stick the
PCI related MSI data into a real data structure and cleanup all users.

No functional change.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Kalle Valo <kvalo@codeaurora.org>
Link: https://lore.kernel.org/r/20211206210224.374863119@linutronix.de
---
 include/linux/msi.h | 84 ++++++++++++++++++++++++++---------------------------
 1 file changed, 41 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index ac6fec105edc..7e5c13f4e41b 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -68,6 +68,42 @@ static inline void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg)
 typedef void (*irq_write_msi_msg_t)(struct msi_desc *desc,
 				    struct msi_msg *msg);
 
+/**
+ * pci_msi_desc - PCI/MSI specific MSI descriptor data
+ *
+ * @msi_mask:	[PCI MSI]   MSI cached mask bits
+ * @msix_ctrl:	[PCI MSI-X] MSI-X cached per vector control bits
+ * @is_msix:	[PCI MSI/X] True if MSI-X
+ * @multiple:	[PCI MSI/X] log2 num of messages allocated
+ * @multi_cap:	[PCI MSI/X] log2 num of messages supported
+ * @can_mask:	[PCI MSI/X] Masking supported?
+ * @is_64:	[PCI MSI/X] Address size: 0=32bit 1=64bit
+ * @entry_nr:	[PCI MSI/X] Entry which is described by this descriptor
+ * @default_irq:[PCI MSI/X] The default pre-assigned non-MSI irq
+ * @mask_pos:	[PCI MSI]   Mask register position
+ * @mask_base:	[PCI MSI-X] Mask register base address
+ */
+struct pci_msi_desc {
+	union {
+		u32 msi_mask;
+		u32 msix_ctrl;
+	};
+	struct {
+		u8	is_msix		: 1;
+		u8	multiple	: 3;
+		u8	multi_cap	: 3;
+		u8	can_mask	: 1;
+		u8	is_64		: 1;
+		u8	is_virtual	: 1;
+		u16	entry_nr;
+		unsigned default_irq;
+	} msi_attrib;
+	union {
+		u8	mask_pos;
+		void __iomem *mask_base;
+	};
+};
+
 /**
  * platform_msi_desc - Platform device specific msi descriptor data
  * @msi_priv_data:	Pointer to platform private data
@@ -107,17 +143,7 @@ struct ti_sci_inta_msi_desc {
  *			address or data changes
  * @write_msi_msg_data:	Data parameter for the callback.
  *
- * @msi_mask:	[PCI MSI]   MSI cached mask bits
- * @msix_ctrl:	[PCI MSI-X] MSI-X cached per vector control bits
- * @is_msix:	[PCI MSI/X] True if MSI-X
- * @multiple:	[PCI MSI/X] log2 num of messages allocated
- * @multi_cap:	[PCI MSI/X] log2 num of messages supported
- * @maskbit:	[PCI MSI/X] Mask-Pending bit supported?
- * @is_64:	[PCI MSI/X] Address size: 0=32bit 1=64bit
- * @entry_nr:	[PCI MSI/X] Entry which is described by this descriptor
- * @default_irq:[PCI MSI/X] The default pre-assigned non-MSI irq
- * @mask_pos:	[PCI MSI]   Mask register position
- * @mask_base:	[PCI MSI-X] Mask register base address
+ * @pci:	[PCI]	    PCI speficic msi descriptor data
  * @platform:	[platform]  Platform device specific msi descriptor data
  * @fsl_mc:	[fsl-mc]    FSL MC device specific msi descriptor data
  * @inta:	[INTA]	    TISCI based INTA specific msi descriptor data
@@ -138,38 +164,10 @@ struct msi_desc {
 	void *write_msi_msg_data;
 
 	union {
-		/* PCI MSI/X specific data */
-		struct {
-			union {
-				u32 msi_mask;
-				u32 msix_ctrl;
-			};
-			struct {
-				u8	is_msix		: 1;
-				u8	multiple	: 3;
-				u8	multi_cap	: 3;
-				u8	can_mask	: 1;
-				u8	is_64		: 1;
-				u8	is_virtual	: 1;
-				u16	entry_nr;
-				unsigned default_irq;
-			} msi_attrib;
-			union {
-				u8	mask_pos;
-				void __iomem *mask_base;
-			};
-		};
-
-		/*
-		 * Non PCI variants add their data structure here. New
-		 * entries need to use a named structure. We want
-		 * proper name spaces for this. The PCI part is
-		 * anonymous for now as it would require an immediate
-		 * tree wide cleanup.
-		 */
-		struct platform_msi_desc platform;
-		struct fsl_mc_msi_desc fsl_mc;
-		struct ti_sci_inta_msi_desc inta;
+		struct pci_msi_desc		pci;
+		struct platform_msi_desc	platform;
+		struct fsl_mc_msi_desc		fsl_mc;
+		struct ti_sci_inta_msi_desc	inta;
 	};
 };
 
-- 
cgit v1.2.3


From ae72f3156729541581f526b85883ca53a20df2fa Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 6 Dec 2021 23:27:42 +0100
Subject: PCI/MSI: Make arch_restore_msi_irqs() less horrible.

Make arch_restore_msi_irqs() return a boolean which indicates whether the
core code should restore the MSI message or not. Get rid of the indirection
in x86.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>	# PCI
Link: https://lore.kernel.org/r/20211206210224.485668098@linutronix.de
---
 include/linux/msi.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index 7e5c13f4e41b..673899fc24f6 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -273,11 +273,10 @@ static inline void arch_teardown_msi_irqs(struct pci_dev *dev)
 #endif
 
 /*
- * The restore hooks are still available as they are useful even
- * for fully irq domain based setups. Courtesy to XEN/X86.
+ * The restore hook is still available even for fully irq domain based
+ * setups. Courtesy to XEN/X86.
  */
-void arch_restore_msi_irqs(struct pci_dev *dev);
-void default_restore_msi_irqs(struct pci_dev *dev);
+bool arch_restore_msi_irqs(struct pci_dev *dev);
 
 #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
 
-- 
cgit v1.2.3


From aa423ac4221abdfb8588751e7838ca5f42f56db3 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 6 Dec 2021 23:27:52 +0100
Subject: PCI/MSI: Split out irqdomain code

Move the irqdomain specific code into its own file.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://lore.kernel.org/r/20211206210224.817754783@linutronix.de
---
 include/linux/msi.h | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index 673899fc24f6..7ff7cf23b78d 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -259,17 +259,6 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc);
 void arch_teardown_msi_irq(unsigned int irq);
 int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
 void arch_teardown_msi_irqs(struct pci_dev *dev);
-#else
-static inline int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
-{
-	WARN_ON_ONCE(1);
-	return -ENODEV;
-}
-
-static inline void arch_teardown_msi_irqs(struct pci_dev *dev)
-{
-	WARN_ON_ONCE(1);
-}
 #endif
 
 /*
-- 
cgit v1.2.3


From 85aa607e79f8343f1ea028b29bdf8b6bc99c729a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 6 Dec 2021 23:27:54 +0100
Subject: PCI/MSI: Sanitize MSI-X table map handling

Unmapping the MSI-X base mapping in the loops which allocate/free MSI
descriptors is daft and in the way of allowing runtime expansion of MSI-X
descriptors.

Store the mapping in struct pci_dev and free it after freeing the MSI-X
descriptors.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://lore.kernel.org/r/20211206210224.871651518@linutronix.de
---
 include/linux/pci.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 18a75c8e615c..8cb103677f5a 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -473,6 +473,7 @@ struct pci_dev {
 	u8		ptm_granularity;
 #endif
 #ifdef CONFIG_PCI_MSI
+	void __iomem	*msix_base;
 	const struct attribute_group **msi_irq_groups;
 #endif
 	struct pci_vpd	vpd;
-- 
cgit v1.2.3


From cd119b09a87d8beb50356d8c5c6aa42d89c44eb7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 6 Dec 2021 23:27:56 +0100
Subject: PCI/MSI: Move msi_lock to struct pci_dev

It's only required for PCI/MSI. So no point in having it in every struct
device.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://lore.kernel.org/r/20211206210224.925241961@linutronix.de
---
 include/linux/device.h | 2 --
 include/linux/pci.h    | 1 +
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index e270cb740b9e..2a22875238a6 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -407,7 +407,6 @@ struct dev_links_info {
  * @em_pd:	device's energy model performance domain
  * @pins:	For device pin management.
  *		See Documentation/driver-api/pin-control.rst for details.
- * @msi_lock:	Lock to protect MSI mask cache and mask register
  * @msi_list:	Hosts MSI descriptors
  * @msi_domain: The generic MSI domain this device is using.
  * @numa_node:	NUMA node this device is close to.
@@ -508,7 +507,6 @@ struct device {
 	struct dev_pin_info	*pins;
 #endif
 #ifdef CONFIG_GENERIC_MSI_IRQ
-	raw_spinlock_t		msi_lock;
 	struct list_head	msi_list;
 #endif
 #ifdef CONFIG_DMA_OPS
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 8cb103677f5a..5cc46baef519 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -474,6 +474,7 @@ struct pci_dev {
 #endif
 #ifdef CONFIG_PCI_MSI
 	void __iomem	*msix_base;
+	raw_spinlock_t	msi_lock;
 	const struct attribute_group **msi_irq_groups;
 #endif
 	struct pci_vpd	vpd;
-- 
cgit v1.2.3


From 57ce3a3c99b21e9c4f951ef01e0a3603c987c259 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 6 Dec 2021 23:27:57 +0100
Subject: PCI/MSI: Make pci_msi_domain_check_cap() static

No users outside of that file.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://lore.kernel.org/r/20211206210224.980989243@linutronix.de
---
 include/linux/msi.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index 7ff7cf23b78d..5248678e05d1 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -439,8 +439,6 @@ void *platform_msi_get_host_data(struct irq_domain *domain);
 struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode,
 					     struct msi_domain_info *info,
 					     struct irq_domain *parent);
-int pci_msi_domain_check_cap(struct irq_domain *domain,
-			     struct msi_domain_info *info, struct device *dev);
 u32 pci_msi_domain_get_msi_rid(struct irq_domain *domain, struct pci_dev *pdev);
 struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev);
 bool pci_dev_has_special_msi_domain(struct pci_dev *pdev);
-- 
cgit v1.2.3


From 890337624e1fa2da079fc1c036a62d178c985280 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 6 Dec 2021 23:27:59 +0100
Subject: genirq/msi: Handle PCI/MSI allocation fail in core code

Get rid of yet another irqdomain callback and let the core code return the
already available information of how many descriptors could be allocated.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>	# PCI
Link: https://lore.kernel.org/r/20211206210225.046615302@linutronix.de
---
 include/linux/msi.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index 5248678e05d1..ba4a39c430b5 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -286,7 +286,6 @@ struct msi_domain_info;
  * @msi_check:		Callback for verification of the domain/info/dev data
  * @msi_prepare:	Prepare the allocation of the interrupts in the domain
  * @set_desc:		Set the msi descriptor for an interrupt
- * @handle_error:	Optional error handler if the allocation fails
  * @domain_alloc_irqs:	Optional function to override the default allocation
  *			function.
  * @domain_free_irqs:	Optional function to override the default free
@@ -295,7 +294,7 @@ struct msi_domain_info;
  * @get_hwirq, @msi_init and @msi_free are callbacks used by the underlying
  * irqdomain.
  *
- * @msi_check, @msi_prepare, @handle_error and @set_desc are callbacks used by
+ * @msi_check, @msi_prepare and @set_desc are callbacks used by
  * msi_domain_alloc/free_irqs().
  *
  * @domain_alloc_irqs, @domain_free_irqs can be used to override the
@@ -332,8 +331,6 @@ struct msi_domain_ops {
 				       msi_alloc_info_t *arg);
 	void		(*set_desc)(msi_alloc_info_t *arg,
 				    struct msi_desc *desc);
-	int		(*handle_error)(struct irq_domain *domain,
-					struct msi_desc *desc, int error);
 	int		(*domain_alloc_irqs)(struct irq_domain *domain,
 					     struct device *dev, int nvec);
 	void		(*domain_free_irqs)(struct irq_domain *domain,
-- 
cgit v1.2.3


From 50468e4313355b161cac8a5155a45832995b7f25 Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko@kernel.org>
Date: Tue, 16 Nov 2021 18:21:16 +0200
Subject: x86/sgx: Add an attribute for the amount of SGX memory in a NUMA node

== Problem ==

The amount of SGX memory on a system is determined by the BIOS and it
varies wildly between systems.  It can be as small as dozens of MB's
and as large as many GB's on servers.  Just like how applications need
to know how much regular RAM is available, enclave builders need to
know how much SGX memory an enclave can consume.

== Solution ==

Introduce a new sysfs file:

	/sys/devices/system/node/nodeX/x86/sgx_total_bytes

to enumerate the amount of SGX memory available in each NUMA node.
This serves the same function for SGX as /proc/meminfo or
/sys/devices/system/node/nodeX/meminfo does for normal RAM.

'sgx_total_bytes' is needed today to help drive the SGX selftests.
SGX-specific swap code is exercised by creating overcommitted enclaves
which are larger than the physical SGX memory on the system.  They
currently use a CPUID-based approach which can diverge from the actual
amount of SGX memory available.  'sgx_total_bytes' ensures that the
selftests can work efficiently and do not attempt stupid things like
creating a 100,000 MB enclave on a system with 128 MB of SGX memory.

== Implementation Details ==

Introduce CONFIG_HAVE_ARCH_NODE_DEV_GROUP opt-in flag to expose an
arch specific attribute group, and add an attribute for the amount of
SGX memory in bytes to each NUMA node:

== ABI Design Discussion ==

As opposed to the per-node ABI, a single, global ABI was considered.
However, this would prevent enclaves from being able to size
themselves so that they fit on a single NUMA node.  Essentially, a
single value would rule out NUMA optimizations for enclaves.

Create a new "x86/" directory inside each "nodeX/" sysfs directory.
'sgx_total_bytes' is expected to be the first of at least a few
sgx-specific files to be placed in the new directory.  Just scanning
/proc/meminfo, these are the no-brainers that we have for RAM, but we
need for SGX:

	MemTotal:       xxxx kB // sgx_total_bytes (implemented here)
	MemFree:        yyyy kB // sgx_free_bytes
	SwapTotal:      zzzz kB // sgx_swapped_bytes

So, at *least* three.  I think we will eventually end up needing
something more along the lines of a dozen.  A new directory (as
opposed to being in the nodeX/ "root") directory avoids cluttering the
root with several "sgx_*" files.

Place the new file in a new "nodeX/x86/" directory because SGX is
highly x86-specific.  It is very unlikely that any other architecture
(or even non-Intel x86 vendor) will ever implement SGX.  Using "sgx/"
as opposed to "x86/" was also considered.  But, there is a real chance
this can get used for other arch-specific purposes.

[ dhansen: rewrite changelog ]

Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Borislav Petkov <bp@suse.de>
Link: https://lkml.kernel.org/r/20211116162116.93081-2-jarkko@kernel.org
---
 include/linux/numa.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/numa.h b/include/linux/numa.h
index cb44cfe2b725..59df211d051f 100644
--- a/include/linux/numa.h
+++ b/include/linux/numa.h
@@ -58,4 +58,8 @@ static inline int phys_to_target_node(u64 start)
 }
 #endif
 
+#ifdef CONFIG_HAVE_ARCH_NODE_DEV_GROUP
+extern const struct attribute_group arch_node_dev_group;
+#endif
+
 #endif /* _LINUX_NUMA_H */
-- 
cgit v1.2.3


From 67b967ddd93d0ed57d392a00f6f90060f0910c0e Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Thu, 18 Nov 2021 12:46:59 +0100
Subject: mtd: Introduce an expert mode for forensics and debugging purposes

When developping NAND controller drivers or when debugging filesystem
corruptions, it is quite common to need hacking locally into the
MTD/NAND core in order to get access to the content of the bad
blocks. Instead of having multiple implementations out there let's
provide a simple yet effective specific MTD-wide debugfs entry to fully
disable these checks on purpose.

A warning is added to inform the user when this mode gets enabled.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20211118114659.1282855-1-miquel.raynal@bootlin.com
---
 include/linux/mtd/mtd.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index f5e7dfc2e4e9..1ffa933121f6 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -711,4 +711,7 @@ static inline int mtd_is_bitflip_or_eccerr(int err) {
 
 unsigned mtd_mmap_capabilities(struct mtd_info *mtd);
 
+extern char *mtd_expert_analysis_warning;
+extern bool mtd_expert_analysis_mode;
+
 #endif /* __MTD_MTD_H__ */
-- 
cgit v1.2.3


From cab2d3fd6866e089b5c50db09dece131f85bfebd Mon Sep 17 00:00:00 2001
From: Loic Poulain <loic.poulain@linaro.org>
Date: Thu, 9 Dec 2021 18:46:33 +0530
Subject: bus: mhi: core: Add support for forced PM resume

For whatever reason, some devices like QCA6390, WCN6855 using ath11k
are not in M3 state during PM resume, but still functional. The
mhi_pm_resume should then not fail in those cases, and let the higher
level device specific stack continue resuming process.

Add an API mhi_pm_resume_force(), to force resuming irrespective of the
current MHI state. This fixes a regression with non functional ath11k WiFi
after suspend/resume cycle on some machines.

Bug report: https://bugzilla.kernel.org/show_bug.cgi?id=214179

Link: https://lore.kernel.org/regressions/871r5p0x2u.fsf@codeaurora.org/
Fixes: 020d3b26c07a ("bus: mhi: Early MHI resume failure in non M3 state")
Cc: stable@vger.kernel.org #5.13
Reported-by: Kalle Valo <kvalo@codeaurora.org>
Reported-by: Pengyu Ma <mapengyu@gmail.com>
Tested-by: Kalle Valo <kvalo@kernel.org>
Acked-by: Kalle Valo <kvalo@kernel.org>
Signed-off-by: Loic Poulain <loic.poulain@linaro.org>
[mani: Switched to API, added bug report, reported-by tags and CCed stable]
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Link: https://lore.kernel.org/r/20211209131633.4168-1-manivannan.sadhasivam@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/mhi.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mhi.h b/include/linux/mhi.h
index 723985879035..a5cc4cdf9cc8 100644
--- a/include/linux/mhi.h
+++ b/include/linux/mhi.h
@@ -663,6 +663,19 @@ int mhi_pm_suspend(struct mhi_controller *mhi_cntrl);
  */
 int mhi_pm_resume(struct mhi_controller *mhi_cntrl);
 
+/**
+ * mhi_pm_resume_force - Force resume MHI from suspended state
+ * @mhi_cntrl: MHI controller
+ *
+ * Resume the device irrespective of its MHI state. As per the MHI spec, devices
+ * has to be in M3 state during resume. But some devices seem to be in a
+ * different MHI state other than M3 but they continue working fine if allowed.
+ * This API is intented to be used for such devices.
+ *
+ * Return: 0 if the resume succeeds, a negative error code otherwise
+ */
+int mhi_pm_resume_force(struct mhi_controller *mhi_cntrl);
+
 /**
  * mhi_download_rddm_image - Download ramdump image from device for
  *                           debugging purpose.
-- 
cgit v1.2.3


From dc70ec217cec504e6f8fee8fd91bf5c118af05f2 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Sun, 21 Nov 2021 12:54:40 +0000
Subject: KVM: Introduce CONFIG_HAVE_KVM_DIRTY_RING

I'd like to make the build include dirty_ring.c based on whether the
arch wants it or not. That's a whole lot simpler if there's a config
symbol instead of doing it implicitly on KVM_DIRTY_LOG_PAGE_OFFSET
being set to something non-zero.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Message-Id: <20211121125451.9489-2-dwmw2@infradead.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_dirty_ring.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_dirty_ring.h b/include/linux/kvm_dirty_ring.h
index 120e5e90fa1d..4da8d4a4140b 100644
--- a/include/linux/kvm_dirty_ring.h
+++ b/include/linux/kvm_dirty_ring.h
@@ -27,9 +27,9 @@ struct kvm_dirty_ring {
 	int index;
 };
 
-#if (KVM_DIRTY_LOG_PAGE_OFFSET == 0)
+#ifndef CONFIG_HAVE_KVM_DIRTY_RING
 /*
- * If KVM_DIRTY_LOG_PAGE_OFFSET not defined, kvm_dirty_ring.o should
+ * If CONFIG_HAVE_HVM_DIRTY_RING not defined, kvm_dirty_ring.o should
  * not be included as well, so define these nop functions for the arch.
  */
 static inline u32 kvm_dirty_ring_get_rsvd_entries(void)
@@ -74,7 +74,7 @@ static inline bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring)
 	return true;
 }
 
-#else /* KVM_DIRTY_LOG_PAGE_OFFSET == 0 */
+#else /* CONFIG_HAVE_KVM_DIRTY_RING */
 
 u32 kvm_dirty_ring_get_rsvd_entries(void);
 int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size);
@@ -98,6 +98,6 @@ struct page *kvm_dirty_ring_get_page(struct kvm_dirty_ring *ring, u32 offset);
 void kvm_dirty_ring_free(struct kvm_dirty_ring *ring);
 bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring);
 
-#endif /* KVM_DIRTY_LOG_PAGE_OFFSET == 0 */
+#endif /* CONFIG_HAVE_KVM_DIRTY_RING */
 
 #endif	/* KVM_DIRTY_RING_H */
-- 
cgit v1.2.3


From 42288cb44c4b5fff7653bc392b583a2b8bd6a8c0 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Wed, 8 Dec 2021 17:04:51 -0800
Subject: wait: add wake_up_pollfree()

Several ->poll() implementations are special in that they use a
waitqueue whose lifetime is the current task, rather than the struct
file as is normally the case.  This is okay for blocking polls, since a
blocking poll occurs within one task; however, non-blocking polls
require another solution.  This solution is for the queue to be cleared
before it is freed, using 'wake_up_poll(wq, EPOLLHUP | POLLFREE);'.

However, that has a bug: wake_up_poll() calls __wake_up() with
nr_exclusive=1.  Therefore, if there are multiple "exclusive" waiters,
and the wakeup function for the first one returns a positive value, only
that one will be called.  That's *not* what's needed for POLLFREE;
POLLFREE is special in that it really needs to wake up everyone.

Considering the three non-blocking poll systems:

- io_uring poll doesn't handle POLLFREE at all, so it is broken anyway.

- aio poll is unaffected, since it doesn't support exclusive waits.
  However, that's fragile, as someone could add this feature later.

- epoll doesn't appear to be broken by this, since its wakeup function
  returns 0 when it sees POLLFREE.  But this is fragile.

Although there is a workaround (see epoll), it's better to define a
function which always sends POLLFREE to all waiters.  Add such a
function.  Also make it verify that the queue really becomes empty after
all waiters have been woken up.

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20211209010455.42744-2-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 include/linux/wait.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 2d0df57c9902..851e07da2583 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -217,6 +217,7 @@ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void
 void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
 void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr);
 void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode);
+void __wake_up_pollfree(struct wait_queue_head *wq_head);
 
 #define wake_up(x)			__wake_up(x, TASK_NORMAL, 1, NULL)
 #define wake_up_nr(x, nr)		__wake_up(x, TASK_NORMAL, nr, NULL)
@@ -245,6 +246,31 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode);
 #define wake_up_interruptible_sync_poll_locked(x, m)				\
 	__wake_up_locked_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m))
 
+/**
+ * wake_up_pollfree - signal that a polled waitqueue is going away
+ * @wq_head: the wait queue head
+ *
+ * In the very rare cases where a ->poll() implementation uses a waitqueue whose
+ * lifetime is tied to a task rather than to the 'struct file' being polled,
+ * this function must be called before the waitqueue is freed so that
+ * non-blocking polls (e.g. epoll) are notified that the queue is going away.
+ *
+ * The caller must also RCU-delay the freeing of the wait_queue_head, e.g. via
+ * an explicit synchronize_rcu() or call_rcu(), or via SLAB_TYPESAFE_BY_RCU.
+ */
+static inline void wake_up_pollfree(struct wait_queue_head *wq_head)
+{
+	/*
+	 * For performance reasons, we don't always take the queue lock here.
+	 * Therefore, we might race with someone removing the last entry from
+	 * the queue, and proceed while they still hold the queue lock.
+	 * However, rcu_read_lock() is required to be held in such cases, so we
+	 * can safely proceed with an RCU-delayed free.
+	 */
+	if (waitqueue_active(wq_head))
+		__wake_up_pollfree(wq_head);
+}
+
 #define ___wait_cond_timeout(condition)						\
 ({										\
 	bool __cond = (condition);						\
-- 
cgit v1.2.3


From 6abfaaf124a81b7d2ab132cc2c9885baa14171e5 Mon Sep 17 00:00:00 2001
From: Lukas Czerner <lczerner@redhat.com>
Date: Wed, 27 Oct 2021 16:18:45 +0200
Subject: fs_parse: allow parameter value to be empty

Allow parameter value to be empty by specifying fs_param_can_be_empty
flag.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
Link: https://lore.kernel.org/r/20211027141857.33657-2-lczerner@redhat.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 include/linux/fs_parser.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h
index aab0ffc6bac6..f103c91139d4 100644
--- a/include/linux/fs_parser.h
+++ b/include/linux/fs_parser.h
@@ -42,7 +42,7 @@ struct fs_parameter_spec {
 	u8			opt;	/* Option number (returned by fs_parse()) */
 	unsigned short		flags;
 #define fs_param_neg_with_no	0x0002	/* "noxxx" is negative param */
-#define fs_param_neg_with_empty	0x0004	/* "xxx=" is negative param */
+#define fs_param_can_be_empty	0x0004	/* "xxx=" is allowed */
 #define fs_param_deprecated	0x0008	/* The param is deprecated */
 	const void		*data;
 };
-- 
cgit v1.2.3


From 3e5b1feccea7db576353ffc302f78d522e4116e6 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Thu, 9 Dec 2021 13:11:32 +0000
Subject: net: phylink: add legacy_pre_march2020 indicator

Add a boolean to phylink_config to indicate whether a driver has not
been updated for the changes in commit 7cceb599d15d ("net: phylink:
avoid mac_config calls"), and thus are reliant on the old behaviour.

We were currently keying the phylink behaviour on the presence of a
PCS, but this is sub-optimal for modern drivers that may not have a
PCS.

This commit merely introduces the new flag, but does not add any use,
since we need all legacy drivers to set this flag before it can be
used. Once these legacy drivers have been updated, we can remove this
flag.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/phylink.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index 01224235df0f..d005b8e36048 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -84,6 +84,8 @@ enum phylink_op_type {
  * struct phylink_config - PHYLINK configuration structure
  * @dev: a pointer to a struct device associated with the MAC
  * @type: operation type of PHYLINK instance
+ * @legacy_pre_march2020: driver has not been updated for March 2020 updates
+ *	(See commit 7cceb599d15d ("net: phylink: avoid mac_config calls")
  * @pcs_poll: MAC PCS cannot provide link change interrupt
  * @poll_fixed_state: if true, starts link_poll,
  *		      if MAC link is at %MLO_AN_FIXED mode.
@@ -97,6 +99,7 @@ enum phylink_op_type {
 struct phylink_config {
 	struct device *dev;
 	enum phylink_op_type type;
+	bool legacy_pre_march2020;
 	bool pcs_poll;
 	bool poll_fixed_state;
 	bool ovr_an_inband;
-- 
cgit v1.2.3


From 001f4261fe4d5ae710cf1f445b6cae6d9d3ae26e Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Thu, 9 Dec 2021 13:11:48 +0000
Subject: net: phylink: use legacy_pre_march2020

Use the legacy flag to indicate whether we should operate in legacy
mode. This allows us to stop using the presence of a PCS as an
indicator to the age of the phylink user, and make PCS presence
optional.

Legacy mode involves:
1) calling mac_config() whenever the link comes up
2) calling mac_config() whenever the inband advertisement changes,
   possibly followed by a call to mac_an_restart()
3) making use of mac_an_restart()
4) making use of mac_pcs_get_state()

All the above functionality was moved to a seperate "PCS" block of
operations in March 2020.

Update the documents to indicate that the differences that this flag
makes.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/phylink.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index d005b8e36048..a2f266cc3442 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -190,6 +190,10 @@ void validate(struct phylink_config *config, unsigned long *supported,
  * negotiation completion state in @state->an_complete, and link up state
  * in @state->link. If possible, @state->lp_advertising should also be
  * populated.
+ *
+ * Note: This is a legacy method. This function will not be called unless
+ * legacy_pre_march2020 is set in &struct phylink_config and there is no
+ * PCS attached.
  */
 void mac_pcs_get_state(struct phylink_config *config,
 		       struct phylink_link_state *state);
@@ -230,6 +234,15 @@ int mac_prepare(struct phylink_config *config, unsigned int mode,
  * guaranteed to be correct, and so any mac_config() implementation must
  * never reference these fields.
  *
+ * Note: For legacy March 2020 drivers (drivers with legacy_pre_march2020 set
+ * in their &phylnk_config and which don't have a PCS), this function will be
+ * called on each link up event, and to also change the in-band advert. For
+ * non-legacy drivers, it will only be called to reconfigure the MAC for a
+ * "major" change in e.g. interface mode. It will not be called for changes
+ * in speed, duplex or pause modes or to change the in-band advertisement.
+ * In any case, it is strongly preferred that speed, duplex and pause settings
+ * are handled in the mac_link_up() method and not in this method.
+ *
  * (this requires a rewrite - please refer to mac_link_up() for situations
  *  where the PCS and MAC are not tightly integrated.)
  *
@@ -314,6 +327,10 @@ int mac_finish(struct phylink_config *config, unsigned int mode,
 /**
  * mac_an_restart() - restart 802.3z BaseX autonegotiation
  * @config: a pointer to a &struct phylink_config.
+ *
+ * Note: This is a legacy method. This function will not be called unless
+ * legacy_pre_march2020 is set in &struct phylink_config and there is no
+ * PCS attached.
  */
 void mac_an_restart(struct phylink_config *config);
 
-- 
cgit v1.2.3


From 1a2fb220edca98d18f90e3ef5bd6853a6b22b1b8 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 6 Dec 2021 22:27:58 -0800
Subject: skbuff: Extract list pointers to silence compiler warnings

Under both -Warray-bounds and the object_size sanitizer, the compiler is
upset about accessing prev/next of sk_buff when the object it thinks it
is coming from is sk_buff_head. The warning is a false positive due to
the compiler taking a conservative approach, opting to warn at casting
time rather than access time.

However, in support of enabling -Warray-bounds globally (which has
found many real bugs), arrange things for sk_buff so that the compiler
can unambiguously see that there is no intention to access anything
except prev/next.  Introduce and cast to a separate struct sk_buff_list,
which contains _only_ the first two fields, silencing the warnings:

In file included from ./include/net/net_namespace.h:39,
                 from ./include/linux/netdevice.h:37,
                 from net/core/netpoll.c:17:
net/core/netpoll.c: In function 'refill_skbs':
./include/linux/skbuff.h:2086:9: warning: array subscript 'struct sk_buff[0]' is partly outside array bounds of 'struct sk_buff_head[1]' [-Warray-bounds]
 2086 |         __skb_insert(newsk, next->prev, next, list);
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
net/core/netpoll.c:49:28: note: while referencing 'skb_pool'
   49 | static struct sk_buff_head skb_pool;
      |                            ^~~~~~~~

This change results in no executable instruction differences.

Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20211207062758.2324338-1-keescook@chromium.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index dd262bd8ddbe..6535294f6a48 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -292,9 +292,11 @@ struct tc_skb_ext {
 #endif
 
 struct sk_buff_head {
-	/* These two members must be first. */
-	struct sk_buff	*next;
-	struct sk_buff	*prev;
+	/* These two members must be first to match sk_buff. */
+	struct_group_tagged(sk_buff_list, list,
+		struct sk_buff	*next;
+		struct sk_buff	*prev;
+	);
 
 	__u32		qlen;
 	spinlock_t	lock;
@@ -730,7 +732,7 @@ typedef unsigned char *sk_buff_data_t;
 struct sk_buff {
 	union {
 		struct {
-			/* These two members must be first. */
+			/* These two members must be first to match sk_buff_head. */
 			struct sk_buff		*next;
 			struct sk_buff		*prev;
 
@@ -1976,8 +1978,8 @@ static inline void __skb_insert(struct sk_buff *newsk,
 	 */
 	WRITE_ONCE(newsk->next, next);
 	WRITE_ONCE(newsk->prev, prev);
-	WRITE_ONCE(next->prev, newsk);
-	WRITE_ONCE(prev->next, newsk);
+	WRITE_ONCE(((struct sk_buff_list *)next)->prev, newsk);
+	WRITE_ONCE(((struct sk_buff_list *)prev)->next, newsk);
 	WRITE_ONCE(list->qlen, list->qlen + 1);
 }
 
@@ -2073,7 +2075,7 @@ static inline void __skb_queue_after(struct sk_buff_head *list,
 				     struct sk_buff *prev,
 				     struct sk_buff *newsk)
 {
-	__skb_insert(newsk, prev, prev->next, list);
+	__skb_insert(newsk, prev, ((struct sk_buff_list *)prev)->next, list);
 }
 
 void skb_append(struct sk_buff *old, struct sk_buff *newsk,
@@ -2083,7 +2085,7 @@ static inline void __skb_queue_before(struct sk_buff_head *list,
 				      struct sk_buff *next,
 				      struct sk_buff *newsk)
 {
-	__skb_insert(newsk, next->prev, next, list);
+	__skb_insert(newsk, ((struct sk_buff_list *)next)->prev, next, list);
 }
 
 /**
-- 
cgit v1.2.3


From a4f1192cb53758a7210ed5a9ee695aeba22f75fb Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 9 Dec 2021 14:30:33 +0200
Subject: percpu_ref: Replace kernel.h with the necessary inclusions

When kernel.h is used in the headers it adds a lot into dependency hell,
especially when there are circular dependencies are involved.

Replace kernel.h inclusion with the list of what is really being used.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Dennis Zhou <dennis@kernel.org>
---
 include/linux/percpu-refcount.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index b31d3f3312ce..d73a1c08c3e3 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -51,9 +51,9 @@
 #define _LINUX_PERCPU_REFCOUNT_H
 
 #include <linux/atomic.h>
-#include <linux/kernel.h>
 #include <linux/percpu.h>
 #include <linux/rcupdate.h>
+#include <linux/types.h>
 #include <linux/gfp.h>
 
 struct percpu_ref;
-- 
cgit v1.2.3


From 9756f64c8f2d19c0029a5827bda8ac275302ec22 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Tue, 30 Nov 2021 12:44:11 +0100
Subject: kcsan: Avoid checking scoped accesses from nested contexts

Avoid checking scoped accesses from nested contexts (such as nested
interrupts or in scheduler code) which share the same kcsan_ctx.

This is to avoid detecting false positive races of accesses in the same
thread with currently scoped accesses: consider setting up a watchpoint
for a non-scoped (normal) access that also "conflicts" with a current
scoped access. In a nested interrupt (or in the scheduler), which shares
the same kcsan_ctx, we cannot check scoped accesses set up in the parent
context -- simply ignore them in this case.

With the introduction of kcsan_ctx::disable_scoped, we can also clean up
kcsan_check_scoped_accesses()'s recursion guard, and do not need to
modify the list's prev pointer.

Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/kcsan.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/kcsan.h b/include/linux/kcsan.h
index fc266ecb2a4d..13cef3458fed 100644
--- a/include/linux/kcsan.h
+++ b/include/linux/kcsan.h
@@ -21,6 +21,7 @@
  */
 struct kcsan_ctx {
 	int disable_count; /* disable counter */
+	int disable_scoped; /* disable scoped access counter */
 	int atomic_next; /* number of following atomic ops */
 
 	/*
-- 
cgit v1.2.3


From 69562e4983d93e2791c0bf128b07462afbd7f4dc Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Thu, 5 Aug 2021 14:57:45 +0200
Subject: kcsan: Add core support for a subset of weak memory modeling

Add support for modeling a subset of weak memory, which will enable
detection of a subset of data races due to missing memory barriers.

KCSAN's approach to detecting missing memory barriers is based on
modeling access reordering, and enabled if `CONFIG_KCSAN_WEAK_MEMORY=y`,
which depends on `CONFIG_KCSAN_STRICT=y`. The feature can be enabled or
disabled at boot and runtime via the `kcsan.weak_memory` boot parameter.

Each memory access for which a watchpoint is set up, is also selected
for simulated reordering within the scope of its function (at most 1
in-flight access).

We are limited to modeling the effects of "buffering" (delaying the
access), since the runtime cannot "prefetch" accesses (therefore no
acquire modeling). Once an access has been selected for reordering, it
is checked along every other access until the end of the function scope.
If an appropriate memory barrier is encountered, the access will no
longer be considered for reordering.

When the result of a memory operation should be ordered by a barrier,
KCSAN can then detect data races where the conflict only occurs as a
result of a missing barrier due to reordering accesses.

Suggested-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/kcsan-checks.h | 10 +++++++++-
 include/linux/kcsan.h        | 10 +++++++++-
 include/linux/sched.h        |  3 +++
 3 files changed, 21 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kcsan-checks.h b/include/linux/kcsan-checks.h
index 5f5965246877..a1c6a89fde71 100644
--- a/include/linux/kcsan-checks.h
+++ b/include/linux/kcsan-checks.h
@@ -99,7 +99,15 @@ void kcsan_set_access_mask(unsigned long mask);
 
 /* Scoped access information. */
 struct kcsan_scoped_access {
-	struct list_head list;
+	union {
+		struct list_head list; /* scoped_accesses list */
+		/*
+		 * Not an entry in scoped_accesses list; stack depth from where
+		 * the access was initialized.
+		 */
+		int stack_depth;
+	};
+
 	/* Access information. */
 	const volatile void *ptr;
 	size_t size;
diff --git a/include/linux/kcsan.h b/include/linux/kcsan.h
index 13cef3458fed..c07c71f5ba4f 100644
--- a/include/linux/kcsan.h
+++ b/include/linux/kcsan.h
@@ -49,8 +49,16 @@ struct kcsan_ctx {
 	 */
 	unsigned long access_mask;
 
-	/* List of scoped accesses. */
+	/* List of scoped accesses; likely to be empty. */
 	struct list_head scoped_accesses;
+
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+	/*
+	 * Scoped access for modeling access reordering to detect missing memory
+	 * barriers; only keep 1 to keep fast-path complexity manageable.
+	 */
+	struct kcsan_scoped_access reorder_access;
+#endif
 };
 
 /**
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 78c351e35fec..0cd40b010487 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1339,6 +1339,9 @@ struct task_struct {
 #ifdef CONFIG_TRACE_IRQFLAGS
 	struct irqtrace_events		kcsan_save_irqtrace;
 #endif
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+	int				kcsan_stack_depth;
+#endif
 #endif
 
 #if IS_ENABLED(CONFIG_KUNIT)
-- 
cgit v1.2.3


From 0b8b0830ac1419d7250fde31ea78793a03f3db44 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Tue, 30 Nov 2021 12:44:13 +0100
Subject: kcsan: Add core memory barrier instrumentation functions

Add the core memory barrier instrumentation functions. These invalidate
the current in-flight reordered access based on the rules for the
respective barrier types and in-flight access type.

To obtain barrier instrumentation that can be disabled via __no_kcsan
with appropriate compiler-support (and not just with objtool help),
barrier instrumentation repurposes __atomic_signal_fence(), instead of
inserting explicit calls. Crucially, __atomic_signal_fence() normally
does not map to any real instructions, but is still intercepted by
fsanitize=thread. As a result, like any other instrumentation done by
the compiler, barrier instrumentation can be disabled with __no_kcsan.

Unfortunately Clang and GCC currently differ in their __no_kcsan aka
__no_sanitize_thread behaviour with respect to builtin atomics (and
__tsan_func_{entry,exit}) instrumentation. This is already reflected in
Kconfig.kcsan's dependencies for KCSAN_WEAK_MEMORY. A later change will
introduce support for newer versions of Clang that can implement
__no_kcsan to also remove the additional instrumentation introduced by
KCSAN_WEAK_MEMORY.

Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/kcsan-checks.h | 71 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 69 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kcsan-checks.h b/include/linux/kcsan-checks.h
index a1c6a89fde71..9d2c869167f2 100644
--- a/include/linux/kcsan-checks.h
+++ b/include/linux/kcsan-checks.h
@@ -36,6 +36,36 @@
  */
 void __kcsan_check_access(const volatile void *ptr, size_t size, int type);
 
+/*
+ * See definition of __tsan_atomic_signal_fence() in kernel/kcsan/core.c.
+ * Note: The mappings are arbitrary, and do not reflect any real mappings of C11
+ * memory orders to the LKMM memory orders and vice-versa!
+ */
+#define __KCSAN_BARRIER_TO_SIGNAL_FENCE_mb	__ATOMIC_SEQ_CST
+#define __KCSAN_BARRIER_TO_SIGNAL_FENCE_wmb	__ATOMIC_ACQ_REL
+#define __KCSAN_BARRIER_TO_SIGNAL_FENCE_rmb	__ATOMIC_ACQUIRE
+#define __KCSAN_BARRIER_TO_SIGNAL_FENCE_release	__ATOMIC_RELEASE
+
+/**
+ * __kcsan_mb - full memory barrier instrumentation
+ */
+void __kcsan_mb(void);
+
+/**
+ * __kcsan_wmb - write memory barrier instrumentation
+ */
+void __kcsan_wmb(void);
+
+/**
+ * __kcsan_rmb - read memory barrier instrumentation
+ */
+void __kcsan_rmb(void);
+
+/**
+ * __kcsan_release - release barrier instrumentation
+ */
+void __kcsan_release(void);
+
 /**
  * kcsan_disable_current - disable KCSAN for the current context
  *
@@ -159,6 +189,10 @@ void kcsan_end_scoped_access(struct kcsan_scoped_access *sa);
 static inline void __kcsan_check_access(const volatile void *ptr, size_t size,
 					int type) { }
 
+static inline void __kcsan_mb(void)			{ }
+static inline void __kcsan_wmb(void)			{ }
+static inline void __kcsan_rmb(void)			{ }
+static inline void __kcsan_release(void)		{ }
 static inline void kcsan_disable_current(void)		{ }
 static inline void kcsan_enable_current(void)		{ }
 static inline void kcsan_enable_current_nowarn(void)	{ }
@@ -191,12 +225,45 @@ static inline void kcsan_end_scoped_access(struct kcsan_scoped_access *sa) { }
  */
 #define __kcsan_disable_current kcsan_disable_current
 #define __kcsan_enable_current kcsan_enable_current_nowarn
-#else
+#else /* __SANITIZE_THREAD__ */
 static inline void kcsan_check_access(const volatile void *ptr, size_t size,
 				      int type) { }
 static inline void __kcsan_enable_current(void)  { }
 static inline void __kcsan_disable_current(void) { }
-#endif
+#endif /* __SANITIZE_THREAD__ */
+
+#if defined(CONFIG_KCSAN_WEAK_MEMORY) && defined(__SANITIZE_THREAD__)
+/*
+ * Normal barrier instrumentation is not done via explicit calls, but by mapping
+ * to a repurposed __atomic_signal_fence(), which normally does not generate any
+ * real instructions, but is still intercepted by fsanitize=thread. This means,
+ * like any other compile-time instrumentation, barrier instrumentation can be
+ * disabled with the __no_kcsan function attribute.
+ *
+ * Also see definition of __tsan_atomic_signal_fence() in kernel/kcsan/core.c.
+ */
+#define __KCSAN_BARRIER_TO_SIGNAL_FENCE(name)					\
+	static __always_inline void kcsan_##name(void)				\
+	{									\
+		barrier();							\
+		__atomic_signal_fence(__KCSAN_BARRIER_TO_SIGNAL_FENCE_##name);	\
+		barrier();							\
+	}
+__KCSAN_BARRIER_TO_SIGNAL_FENCE(mb)
+__KCSAN_BARRIER_TO_SIGNAL_FENCE(wmb)
+__KCSAN_BARRIER_TO_SIGNAL_FENCE(rmb)
+__KCSAN_BARRIER_TO_SIGNAL_FENCE(release)
+#elif defined(CONFIG_KCSAN_WEAK_MEMORY) && defined(__KCSAN_INSTRUMENT_BARRIERS__)
+#define kcsan_mb	__kcsan_mb
+#define kcsan_wmb	__kcsan_wmb
+#define kcsan_rmb	__kcsan_rmb
+#define kcsan_release	__kcsan_release
+#else /* CONFIG_KCSAN_WEAK_MEMORY && ... */
+static inline void kcsan_mb(void)		{ }
+static inline void kcsan_wmb(void)		{ }
+static inline void kcsan_rmb(void)		{ }
+static inline void kcsan_release(void)		{ }
+#endif /* CONFIG_KCSAN_WEAK_MEMORY && ... */
 
 /**
  * __kcsan_check_read - check regular read access for races
-- 
cgit v1.2.3


From f948666de517cf8ebef7cb2c9b2d669dec4bfe2e Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Tue, 30 Nov 2021 12:44:22 +0100
Subject: locking/barriers, kcsan: Add instrumentation for barriers

Adds the required KCSAN instrumentation for barriers if CONFIG_SMP.
KCSAN supports modeling the effects of:

	smp_mb()
	smp_rmb()
	smp_wmb()
	smp_store_release()

Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/spinlock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index b4e5ca23f840..5c0c5174155d 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -171,7 +171,7 @@ do {									\
  * Architectures that can implement ACQUIRE better need to take care.
  */
 #ifndef smp_mb__after_spinlock
-#define smp_mb__after_spinlock()	do { } while (0)
+#define smp_mb__after_spinlock()	kcsan_mb()
 #endif
 
 #ifdef CONFIG_DEBUG_SPINLOCK
-- 
cgit v1.2.3


From e87c4f6642f49627c3430cb3ee78c73fb51b48e4 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Tue, 30 Nov 2021 12:44:24 +0100
Subject: locking/atomics, kcsan: Add instrumentation for barriers

Adds the required KCSAN instrumentation for barriers of atomics.

Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/atomic/atomic-instrumented.h | 135 ++++++++++++++++++++++++++++-
 1 file changed, 134 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/atomic/atomic-instrumented.h b/include/linux/atomic/atomic-instrumented.h
index a0f654370da3..5d69b143c28e 100644
--- a/include/linux/atomic/atomic-instrumented.h
+++ b/include/linux/atomic/atomic-instrumented.h
@@ -45,6 +45,7 @@ atomic_set(atomic_t *v, int i)
 static __always_inline void
 atomic_set_release(atomic_t *v, int i)
 {
+	kcsan_release();
 	instrument_atomic_write(v, sizeof(*v));
 	arch_atomic_set_release(v, i);
 }
@@ -59,6 +60,7 @@ atomic_add(int i, atomic_t *v)
 static __always_inline int
 atomic_add_return(int i, atomic_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_add_return(i, v);
 }
@@ -73,6 +75,7 @@ atomic_add_return_acquire(int i, atomic_t *v)
 static __always_inline int
 atomic_add_return_release(int i, atomic_t *v)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_add_return_release(i, v);
 }
@@ -87,6 +90,7 @@ atomic_add_return_relaxed(int i, atomic_t *v)
 static __always_inline int
 atomic_fetch_add(int i, atomic_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_fetch_add(i, v);
 }
@@ -101,6 +105,7 @@ atomic_fetch_add_acquire(int i, atomic_t *v)
 static __always_inline int
 atomic_fetch_add_release(int i, atomic_t *v)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_fetch_add_release(i, v);
 }
@@ -122,6 +127,7 @@ atomic_sub(int i, atomic_t *v)
 static __always_inline int
 atomic_sub_return(int i, atomic_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_sub_return(i, v);
 }
@@ -136,6 +142,7 @@ atomic_sub_return_acquire(int i, atomic_t *v)
 static __always_inline int
 atomic_sub_return_release(int i, atomic_t *v)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_sub_return_release(i, v);
 }
@@ -150,6 +157,7 @@ atomic_sub_return_relaxed(int i, atomic_t *v)
 static __always_inline int
 atomic_fetch_sub(int i, atomic_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_fetch_sub(i, v);
 }
@@ -164,6 +172,7 @@ atomic_fetch_sub_acquire(int i, atomic_t *v)
 static __always_inline int
 atomic_fetch_sub_release(int i, atomic_t *v)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_fetch_sub_release(i, v);
 }
@@ -185,6 +194,7 @@ atomic_inc(atomic_t *v)
 static __always_inline int
 atomic_inc_return(atomic_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_inc_return(v);
 }
@@ -199,6 +209,7 @@ atomic_inc_return_acquire(atomic_t *v)
 static __always_inline int
 atomic_inc_return_release(atomic_t *v)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_inc_return_release(v);
 }
@@ -213,6 +224,7 @@ atomic_inc_return_relaxed(atomic_t *v)
 static __always_inline int
 atomic_fetch_inc(atomic_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_fetch_inc(v);
 }
@@ -227,6 +239,7 @@ atomic_fetch_inc_acquire(atomic_t *v)
 static __always_inline int
 atomic_fetch_inc_release(atomic_t *v)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_fetch_inc_release(v);
 }
@@ -248,6 +261,7 @@ atomic_dec(atomic_t *v)
 static __always_inline int
 atomic_dec_return(atomic_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_dec_return(v);
 }
@@ -262,6 +276,7 @@ atomic_dec_return_acquire(atomic_t *v)
 static __always_inline int
 atomic_dec_return_release(atomic_t *v)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_dec_return_release(v);
 }
@@ -276,6 +291,7 @@ atomic_dec_return_relaxed(atomic_t *v)
 static __always_inline int
 atomic_fetch_dec(atomic_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_fetch_dec(v);
 }
@@ -290,6 +306,7 @@ atomic_fetch_dec_acquire(atomic_t *v)
 static __always_inline int
 atomic_fetch_dec_release(atomic_t *v)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_fetch_dec_release(v);
 }
@@ -311,6 +328,7 @@ atomic_and(int i, atomic_t *v)
 static __always_inline int
 atomic_fetch_and(int i, atomic_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_fetch_and(i, v);
 }
@@ -325,6 +343,7 @@ atomic_fetch_and_acquire(int i, atomic_t *v)
 static __always_inline int
 atomic_fetch_and_release(int i, atomic_t *v)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_fetch_and_release(i, v);
 }
@@ -346,6 +365,7 @@ atomic_andnot(int i, atomic_t *v)
 static __always_inline int
 atomic_fetch_andnot(int i, atomic_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_fetch_andnot(i, v);
 }
@@ -360,6 +380,7 @@ atomic_fetch_andnot_acquire(int i, atomic_t *v)
 static __always_inline int
 atomic_fetch_andnot_release(int i, atomic_t *v)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_fetch_andnot_release(i, v);
 }
@@ -381,6 +402,7 @@ atomic_or(int i, atomic_t *v)
 static __always_inline int
 atomic_fetch_or(int i, atomic_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_fetch_or(i, v);
 }
@@ -395,6 +417,7 @@ atomic_fetch_or_acquire(int i, atomic_t *v)
 static __always_inline int
 atomic_fetch_or_release(int i, atomic_t *v)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_fetch_or_release(i, v);
 }
@@ -416,6 +439,7 @@ atomic_xor(int i, atomic_t *v)
 static __always_inline int
 atomic_fetch_xor(int i, atomic_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_fetch_xor(i, v);
 }
@@ -430,6 +454,7 @@ atomic_fetch_xor_acquire(int i, atomic_t *v)
 static __always_inline int
 atomic_fetch_xor_release(int i, atomic_t *v)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_fetch_xor_release(i, v);
 }
@@ -444,6 +469,7 @@ atomic_fetch_xor_relaxed(int i, atomic_t *v)
 static __always_inline int
 atomic_xchg(atomic_t *v, int i)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_xchg(v, i);
 }
@@ -458,6 +484,7 @@ atomic_xchg_acquire(atomic_t *v, int i)
 static __always_inline int
 atomic_xchg_release(atomic_t *v, int i)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_xchg_release(v, i);
 }
@@ -472,6 +499,7 @@ atomic_xchg_relaxed(atomic_t *v, int i)
 static __always_inline int
 atomic_cmpxchg(atomic_t *v, int old, int new)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_cmpxchg(v, old, new);
 }
@@ -486,6 +514,7 @@ atomic_cmpxchg_acquire(atomic_t *v, int old, int new)
 static __always_inline int
 atomic_cmpxchg_release(atomic_t *v, int old, int new)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_cmpxchg_release(v, old, new);
 }
@@ -500,6 +529,7 @@ atomic_cmpxchg_relaxed(atomic_t *v, int old, int new)
 static __always_inline bool
 atomic_try_cmpxchg(atomic_t *v, int *old, int new)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	instrument_atomic_read_write(old, sizeof(*old));
 	return arch_atomic_try_cmpxchg(v, old, new);
@@ -516,6 +546,7 @@ atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new)
 static __always_inline bool
 atomic_try_cmpxchg_release(atomic_t *v, int *old, int new)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	instrument_atomic_read_write(old, sizeof(*old));
 	return arch_atomic_try_cmpxchg_release(v, old, new);
@@ -532,6 +563,7 @@ atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new)
 static __always_inline bool
 atomic_sub_and_test(int i, atomic_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_sub_and_test(i, v);
 }
@@ -539,6 +571,7 @@ atomic_sub_and_test(int i, atomic_t *v)
 static __always_inline bool
 atomic_dec_and_test(atomic_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_dec_and_test(v);
 }
@@ -546,6 +579,7 @@ atomic_dec_and_test(atomic_t *v)
 static __always_inline bool
 atomic_inc_and_test(atomic_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_inc_and_test(v);
 }
@@ -553,6 +587,7 @@ atomic_inc_and_test(atomic_t *v)
 static __always_inline bool
 atomic_add_negative(int i, atomic_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_add_negative(i, v);
 }
@@ -560,6 +595,7 @@ atomic_add_negative(int i, atomic_t *v)
 static __always_inline int
 atomic_fetch_add_unless(atomic_t *v, int a, int u)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_fetch_add_unless(v, a, u);
 }
@@ -567,6 +603,7 @@ atomic_fetch_add_unless(atomic_t *v, int a, int u)
 static __always_inline bool
 atomic_add_unless(atomic_t *v, int a, int u)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_add_unless(v, a, u);
 }
@@ -574,6 +611,7 @@ atomic_add_unless(atomic_t *v, int a, int u)
 static __always_inline bool
 atomic_inc_not_zero(atomic_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_inc_not_zero(v);
 }
@@ -581,6 +619,7 @@ atomic_inc_not_zero(atomic_t *v)
 static __always_inline bool
 atomic_inc_unless_negative(atomic_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_inc_unless_negative(v);
 }
@@ -588,6 +627,7 @@ atomic_inc_unless_negative(atomic_t *v)
 static __always_inline bool
 atomic_dec_unless_positive(atomic_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_dec_unless_positive(v);
 }
@@ -595,6 +635,7 @@ atomic_dec_unless_positive(atomic_t *v)
 static __always_inline int
 atomic_dec_if_positive(atomic_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_dec_if_positive(v);
 }
@@ -623,6 +664,7 @@ atomic64_set(atomic64_t *v, s64 i)
 static __always_inline void
 atomic64_set_release(atomic64_t *v, s64 i)
 {
+	kcsan_release();
 	instrument_atomic_write(v, sizeof(*v));
 	arch_atomic64_set_release(v, i);
 }
@@ -637,6 +679,7 @@ atomic64_add(s64 i, atomic64_t *v)
 static __always_inline s64
 atomic64_add_return(s64 i, atomic64_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_add_return(i, v);
 }
@@ -651,6 +694,7 @@ atomic64_add_return_acquire(s64 i, atomic64_t *v)
 static __always_inline s64
 atomic64_add_return_release(s64 i, atomic64_t *v)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_add_return_release(i, v);
 }
@@ -665,6 +709,7 @@ atomic64_add_return_relaxed(s64 i, atomic64_t *v)
 static __always_inline s64
 atomic64_fetch_add(s64 i, atomic64_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_fetch_add(i, v);
 }
@@ -679,6 +724,7 @@ atomic64_fetch_add_acquire(s64 i, atomic64_t *v)
 static __always_inline s64
 atomic64_fetch_add_release(s64 i, atomic64_t *v)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_fetch_add_release(i, v);
 }
@@ -700,6 +746,7 @@ atomic64_sub(s64 i, atomic64_t *v)
 static __always_inline s64
 atomic64_sub_return(s64 i, atomic64_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_sub_return(i, v);
 }
@@ -714,6 +761,7 @@ atomic64_sub_return_acquire(s64 i, atomic64_t *v)
 static __always_inline s64
 atomic64_sub_return_release(s64 i, atomic64_t *v)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_sub_return_release(i, v);
 }
@@ -728,6 +776,7 @@ atomic64_sub_return_relaxed(s64 i, atomic64_t *v)
 static __always_inline s64
 atomic64_fetch_sub(s64 i, atomic64_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_fetch_sub(i, v);
 }
@@ -742,6 +791,7 @@ atomic64_fetch_sub_acquire(s64 i, atomic64_t *v)
 static __always_inline s64
 atomic64_fetch_sub_release(s64 i, atomic64_t *v)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_fetch_sub_release(i, v);
 }
@@ -763,6 +813,7 @@ atomic64_inc(atomic64_t *v)
 static __always_inline s64
 atomic64_inc_return(atomic64_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_inc_return(v);
 }
@@ -777,6 +828,7 @@ atomic64_inc_return_acquire(atomic64_t *v)
 static __always_inline s64
 atomic64_inc_return_release(atomic64_t *v)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_inc_return_release(v);
 }
@@ -791,6 +843,7 @@ atomic64_inc_return_relaxed(atomic64_t *v)
 static __always_inline s64
 atomic64_fetch_inc(atomic64_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_fetch_inc(v);
 }
@@ -805,6 +858,7 @@ atomic64_fetch_inc_acquire(atomic64_t *v)
 static __always_inline s64
 atomic64_fetch_inc_release(atomic64_t *v)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_fetch_inc_release(v);
 }
@@ -826,6 +880,7 @@ atomic64_dec(atomic64_t *v)
 static __always_inline s64
 atomic64_dec_return(atomic64_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_dec_return(v);
 }
@@ -840,6 +895,7 @@ atomic64_dec_return_acquire(atomic64_t *v)
 static __always_inline s64
 atomic64_dec_return_release(atomic64_t *v)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_dec_return_release(v);
 }
@@ -854,6 +910,7 @@ atomic64_dec_return_relaxed(atomic64_t *v)
 static __always_inline s64
 atomic64_fetch_dec(atomic64_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_fetch_dec(v);
 }
@@ -868,6 +925,7 @@ atomic64_fetch_dec_acquire(atomic64_t *v)
 static __always_inline s64
 atomic64_fetch_dec_release(atomic64_t *v)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_fetch_dec_release(v);
 }
@@ -889,6 +947,7 @@ atomic64_and(s64 i, atomic64_t *v)
 static __always_inline s64
 atomic64_fetch_and(s64 i, atomic64_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_fetch_and(i, v);
 }
@@ -903,6 +962,7 @@ atomic64_fetch_and_acquire(s64 i, atomic64_t *v)
 static __always_inline s64
 atomic64_fetch_and_release(s64 i, atomic64_t *v)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_fetch_and_release(i, v);
 }
@@ -924,6 +984,7 @@ atomic64_andnot(s64 i, atomic64_t *v)
 static __always_inline s64
 atomic64_fetch_andnot(s64 i, atomic64_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_fetch_andnot(i, v);
 }
@@ -938,6 +999,7 @@ atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v)
 static __always_inline s64
 atomic64_fetch_andnot_release(s64 i, atomic64_t *v)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_fetch_andnot_release(i, v);
 }
@@ -959,6 +1021,7 @@ atomic64_or(s64 i, atomic64_t *v)
 static __always_inline s64
 atomic64_fetch_or(s64 i, atomic64_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_fetch_or(i, v);
 }
@@ -973,6 +1036,7 @@ atomic64_fetch_or_acquire(s64 i, atomic64_t *v)
 static __always_inline s64
 atomic64_fetch_or_release(s64 i, atomic64_t *v)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_fetch_or_release(i, v);
 }
@@ -994,6 +1058,7 @@ atomic64_xor(s64 i, atomic64_t *v)
 static __always_inline s64
 atomic64_fetch_xor(s64 i, atomic64_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_fetch_xor(i, v);
 }
@@ -1008,6 +1073,7 @@ atomic64_fetch_xor_acquire(s64 i, atomic64_t *v)
 static __always_inline s64
 atomic64_fetch_xor_release(s64 i, atomic64_t *v)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_fetch_xor_release(i, v);
 }
@@ -1022,6 +1088,7 @@ atomic64_fetch_xor_relaxed(s64 i, atomic64_t *v)
 static __always_inline s64
 atomic64_xchg(atomic64_t *v, s64 i)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_xchg(v, i);
 }
@@ -1036,6 +1103,7 @@ atomic64_xchg_acquire(atomic64_t *v, s64 i)
 static __always_inline s64
 atomic64_xchg_release(atomic64_t *v, s64 i)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_xchg_release(v, i);
 }
@@ -1050,6 +1118,7 @@ atomic64_xchg_relaxed(atomic64_t *v, s64 i)
 static __always_inline s64
 atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_cmpxchg(v, old, new);
 }
@@ -1064,6 +1133,7 @@ atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new)
 static __always_inline s64
 atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_cmpxchg_release(v, old, new);
 }
@@ -1078,6 +1148,7 @@ atomic64_cmpxchg_relaxed(atomic64_t *v, s64 old, s64 new)
 static __always_inline bool
 atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	instrument_atomic_read_write(old, sizeof(*old));
 	return arch_atomic64_try_cmpxchg(v, old, new);
@@ -1094,6 +1165,7 @@ atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new)
 static __always_inline bool
 atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	instrument_atomic_read_write(old, sizeof(*old));
 	return arch_atomic64_try_cmpxchg_release(v, old, new);
@@ -1110,6 +1182,7 @@ atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new)
 static __always_inline bool
 atomic64_sub_and_test(s64 i, atomic64_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_sub_and_test(i, v);
 }
@@ -1117,6 +1190,7 @@ atomic64_sub_and_test(s64 i, atomic64_t *v)
 static __always_inline bool
 atomic64_dec_and_test(atomic64_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_dec_and_test(v);
 }
@@ -1124,6 +1198,7 @@ atomic64_dec_and_test(atomic64_t *v)
 static __always_inline bool
 atomic64_inc_and_test(atomic64_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_inc_and_test(v);
 }
@@ -1131,6 +1206,7 @@ atomic64_inc_and_test(atomic64_t *v)
 static __always_inline bool
 atomic64_add_negative(s64 i, atomic64_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_add_negative(i, v);
 }
@@ -1138,6 +1214,7 @@ atomic64_add_negative(s64 i, atomic64_t *v)
 static __always_inline s64
 atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_fetch_add_unless(v, a, u);
 }
@@ -1145,6 +1222,7 @@ atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u)
 static __always_inline bool
 atomic64_add_unless(atomic64_t *v, s64 a, s64 u)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_add_unless(v, a, u);
 }
@@ -1152,6 +1230,7 @@ atomic64_add_unless(atomic64_t *v, s64 a, s64 u)
 static __always_inline bool
 atomic64_inc_not_zero(atomic64_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_inc_not_zero(v);
 }
@@ -1159,6 +1238,7 @@ atomic64_inc_not_zero(atomic64_t *v)
 static __always_inline bool
 atomic64_inc_unless_negative(atomic64_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_inc_unless_negative(v);
 }
@@ -1166,6 +1246,7 @@ atomic64_inc_unless_negative(atomic64_t *v)
 static __always_inline bool
 atomic64_dec_unless_positive(atomic64_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_dec_unless_positive(v);
 }
@@ -1173,6 +1254,7 @@ atomic64_dec_unless_positive(atomic64_t *v)
 static __always_inline s64
 atomic64_dec_if_positive(atomic64_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic64_dec_if_positive(v);
 }
@@ -1201,6 +1283,7 @@ atomic_long_set(atomic_long_t *v, long i)
 static __always_inline void
 atomic_long_set_release(atomic_long_t *v, long i)
 {
+	kcsan_release();
 	instrument_atomic_write(v, sizeof(*v));
 	arch_atomic_long_set_release(v, i);
 }
@@ -1215,6 +1298,7 @@ atomic_long_add(long i, atomic_long_t *v)
 static __always_inline long
 atomic_long_add_return(long i, atomic_long_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_add_return(i, v);
 }
@@ -1229,6 +1313,7 @@ atomic_long_add_return_acquire(long i, atomic_long_t *v)
 static __always_inline long
 atomic_long_add_return_release(long i, atomic_long_t *v)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_add_return_release(i, v);
 }
@@ -1243,6 +1328,7 @@ atomic_long_add_return_relaxed(long i, atomic_long_t *v)
 static __always_inline long
 atomic_long_fetch_add(long i, atomic_long_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_fetch_add(i, v);
 }
@@ -1257,6 +1343,7 @@ atomic_long_fetch_add_acquire(long i, atomic_long_t *v)
 static __always_inline long
 atomic_long_fetch_add_release(long i, atomic_long_t *v)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_fetch_add_release(i, v);
 }
@@ -1278,6 +1365,7 @@ atomic_long_sub(long i, atomic_long_t *v)
 static __always_inline long
 atomic_long_sub_return(long i, atomic_long_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_sub_return(i, v);
 }
@@ -1292,6 +1380,7 @@ atomic_long_sub_return_acquire(long i, atomic_long_t *v)
 static __always_inline long
 atomic_long_sub_return_release(long i, atomic_long_t *v)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_sub_return_release(i, v);
 }
@@ -1306,6 +1395,7 @@ atomic_long_sub_return_relaxed(long i, atomic_long_t *v)
 static __always_inline long
 atomic_long_fetch_sub(long i, atomic_long_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_fetch_sub(i, v);
 }
@@ -1320,6 +1410,7 @@ atomic_long_fetch_sub_acquire(long i, atomic_long_t *v)
 static __always_inline long
 atomic_long_fetch_sub_release(long i, atomic_long_t *v)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_fetch_sub_release(i, v);
 }
@@ -1341,6 +1432,7 @@ atomic_long_inc(atomic_long_t *v)
 static __always_inline long
 atomic_long_inc_return(atomic_long_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_inc_return(v);
 }
@@ -1355,6 +1447,7 @@ atomic_long_inc_return_acquire(atomic_long_t *v)
 static __always_inline long
 atomic_long_inc_return_release(atomic_long_t *v)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_inc_return_release(v);
 }
@@ -1369,6 +1462,7 @@ atomic_long_inc_return_relaxed(atomic_long_t *v)
 static __always_inline long
 atomic_long_fetch_inc(atomic_long_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_fetch_inc(v);
 }
@@ -1383,6 +1477,7 @@ atomic_long_fetch_inc_acquire(atomic_long_t *v)
 static __always_inline long
 atomic_long_fetch_inc_release(atomic_long_t *v)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_fetch_inc_release(v);
 }
@@ -1404,6 +1499,7 @@ atomic_long_dec(atomic_long_t *v)
 static __always_inline long
 atomic_long_dec_return(atomic_long_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_dec_return(v);
 }
@@ -1418,6 +1514,7 @@ atomic_long_dec_return_acquire(atomic_long_t *v)
 static __always_inline long
 atomic_long_dec_return_release(atomic_long_t *v)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_dec_return_release(v);
 }
@@ -1432,6 +1529,7 @@ atomic_long_dec_return_relaxed(atomic_long_t *v)
 static __always_inline long
 atomic_long_fetch_dec(atomic_long_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_fetch_dec(v);
 }
@@ -1446,6 +1544,7 @@ atomic_long_fetch_dec_acquire(atomic_long_t *v)
 static __always_inline long
 atomic_long_fetch_dec_release(atomic_long_t *v)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_fetch_dec_release(v);
 }
@@ -1467,6 +1566,7 @@ atomic_long_and(long i, atomic_long_t *v)
 static __always_inline long
 atomic_long_fetch_and(long i, atomic_long_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_fetch_and(i, v);
 }
@@ -1481,6 +1581,7 @@ atomic_long_fetch_and_acquire(long i, atomic_long_t *v)
 static __always_inline long
 atomic_long_fetch_and_release(long i, atomic_long_t *v)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_fetch_and_release(i, v);
 }
@@ -1502,6 +1603,7 @@ atomic_long_andnot(long i, atomic_long_t *v)
 static __always_inline long
 atomic_long_fetch_andnot(long i, atomic_long_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_fetch_andnot(i, v);
 }
@@ -1516,6 +1618,7 @@ atomic_long_fetch_andnot_acquire(long i, atomic_long_t *v)
 static __always_inline long
 atomic_long_fetch_andnot_release(long i, atomic_long_t *v)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_fetch_andnot_release(i, v);
 }
@@ -1537,6 +1640,7 @@ atomic_long_or(long i, atomic_long_t *v)
 static __always_inline long
 atomic_long_fetch_or(long i, atomic_long_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_fetch_or(i, v);
 }
@@ -1551,6 +1655,7 @@ atomic_long_fetch_or_acquire(long i, atomic_long_t *v)
 static __always_inline long
 atomic_long_fetch_or_release(long i, atomic_long_t *v)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_fetch_or_release(i, v);
 }
@@ -1572,6 +1677,7 @@ atomic_long_xor(long i, atomic_long_t *v)
 static __always_inline long
 atomic_long_fetch_xor(long i, atomic_long_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_fetch_xor(i, v);
 }
@@ -1586,6 +1692,7 @@ atomic_long_fetch_xor_acquire(long i, atomic_long_t *v)
 static __always_inline long
 atomic_long_fetch_xor_release(long i, atomic_long_t *v)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_fetch_xor_release(i, v);
 }
@@ -1600,6 +1707,7 @@ atomic_long_fetch_xor_relaxed(long i, atomic_long_t *v)
 static __always_inline long
 atomic_long_xchg(atomic_long_t *v, long i)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_xchg(v, i);
 }
@@ -1614,6 +1722,7 @@ atomic_long_xchg_acquire(atomic_long_t *v, long i)
 static __always_inline long
 atomic_long_xchg_release(atomic_long_t *v, long i)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_xchg_release(v, i);
 }
@@ -1628,6 +1737,7 @@ atomic_long_xchg_relaxed(atomic_long_t *v, long i)
 static __always_inline long
 atomic_long_cmpxchg(atomic_long_t *v, long old, long new)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_cmpxchg(v, old, new);
 }
@@ -1642,6 +1752,7 @@ atomic_long_cmpxchg_acquire(atomic_long_t *v, long old, long new)
 static __always_inline long
 atomic_long_cmpxchg_release(atomic_long_t *v, long old, long new)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_cmpxchg_release(v, old, new);
 }
@@ -1656,6 +1767,7 @@ atomic_long_cmpxchg_relaxed(atomic_long_t *v, long old, long new)
 static __always_inline bool
 atomic_long_try_cmpxchg(atomic_long_t *v, long *old, long new)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	instrument_atomic_read_write(old, sizeof(*old));
 	return arch_atomic_long_try_cmpxchg(v, old, new);
@@ -1672,6 +1784,7 @@ atomic_long_try_cmpxchg_acquire(atomic_long_t *v, long *old, long new)
 static __always_inline bool
 atomic_long_try_cmpxchg_release(atomic_long_t *v, long *old, long new)
 {
+	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
 	instrument_atomic_read_write(old, sizeof(*old));
 	return arch_atomic_long_try_cmpxchg_release(v, old, new);
@@ -1688,6 +1801,7 @@ atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, long new)
 static __always_inline bool
 atomic_long_sub_and_test(long i, atomic_long_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_sub_and_test(i, v);
 }
@@ -1695,6 +1809,7 @@ atomic_long_sub_and_test(long i, atomic_long_t *v)
 static __always_inline bool
 atomic_long_dec_and_test(atomic_long_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_dec_and_test(v);
 }
@@ -1702,6 +1817,7 @@ atomic_long_dec_and_test(atomic_long_t *v)
 static __always_inline bool
 atomic_long_inc_and_test(atomic_long_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_inc_and_test(v);
 }
@@ -1709,6 +1825,7 @@ atomic_long_inc_and_test(atomic_long_t *v)
 static __always_inline bool
 atomic_long_add_negative(long i, atomic_long_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_add_negative(i, v);
 }
@@ -1716,6 +1833,7 @@ atomic_long_add_negative(long i, atomic_long_t *v)
 static __always_inline long
 atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_fetch_add_unless(v, a, u);
 }
@@ -1723,6 +1841,7 @@ atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u)
 static __always_inline bool
 atomic_long_add_unless(atomic_long_t *v, long a, long u)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_add_unless(v, a, u);
 }
@@ -1730,6 +1849,7 @@ atomic_long_add_unless(atomic_long_t *v, long a, long u)
 static __always_inline bool
 atomic_long_inc_not_zero(atomic_long_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_inc_not_zero(v);
 }
@@ -1737,6 +1857,7 @@ atomic_long_inc_not_zero(atomic_long_t *v)
 static __always_inline bool
 atomic_long_inc_unless_negative(atomic_long_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_inc_unless_negative(v);
 }
@@ -1744,6 +1865,7 @@ atomic_long_inc_unless_negative(atomic_long_t *v)
 static __always_inline bool
 atomic_long_dec_unless_positive(atomic_long_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_dec_unless_positive(v);
 }
@@ -1751,6 +1873,7 @@ atomic_long_dec_unless_positive(atomic_long_t *v)
 static __always_inline long
 atomic_long_dec_if_positive(atomic_long_t *v)
 {
+	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
 	return arch_atomic_long_dec_if_positive(v);
 }
@@ -1758,6 +1881,7 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 #define xchg(ptr, ...) \
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
+	kcsan_mb(); \
 	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
 	arch_xchg(__ai_ptr, __VA_ARGS__); \
 })
@@ -1772,6 +1896,7 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 #define xchg_release(ptr, ...) \
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
+	kcsan_release(); \
 	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
 	arch_xchg_release(__ai_ptr, __VA_ARGS__); \
 })
@@ -1786,6 +1911,7 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 #define cmpxchg(ptr, ...) \
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
+	kcsan_mb(); \
 	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
 	arch_cmpxchg(__ai_ptr, __VA_ARGS__); \
 })
@@ -1800,6 +1926,7 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 #define cmpxchg_release(ptr, ...) \
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
+	kcsan_release(); \
 	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
 	arch_cmpxchg_release(__ai_ptr, __VA_ARGS__); \
 })
@@ -1814,6 +1941,7 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 #define cmpxchg64(ptr, ...) \
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
+	kcsan_mb(); \
 	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
 	arch_cmpxchg64(__ai_ptr, __VA_ARGS__); \
 })
@@ -1828,6 +1956,7 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 #define cmpxchg64_release(ptr, ...) \
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
+	kcsan_release(); \
 	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
 	arch_cmpxchg64_release(__ai_ptr, __VA_ARGS__); \
 })
@@ -1843,6 +1972,7 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
 	typeof(oldp) __ai_oldp = (oldp); \
+	kcsan_mb(); \
 	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
 	instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \
 	arch_try_cmpxchg(__ai_ptr, __ai_oldp, __VA_ARGS__); \
@@ -1861,6 +1991,7 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
 	typeof(oldp) __ai_oldp = (oldp); \
+	kcsan_release(); \
 	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
 	instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \
 	arch_try_cmpxchg_release(__ai_ptr, __ai_oldp, __VA_ARGS__); \
@@ -1892,6 +2023,7 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 #define sync_cmpxchg(ptr, ...) \
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
+	kcsan_mb(); \
 	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
 	arch_sync_cmpxchg(__ai_ptr, __VA_ARGS__); \
 })
@@ -1899,6 +2031,7 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 #define cmpxchg_double(ptr, ...) \
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
+	kcsan_mb(); \
 	instrument_atomic_write(__ai_ptr, 2 * sizeof(*__ai_ptr)); \
 	arch_cmpxchg_double(__ai_ptr, __VA_ARGS__); \
 })
@@ -1912,4 +2045,4 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 })
 
 #endif /* _LINUX_ATOMIC_INSTRUMENTED_H */
-// 2a9553f0a9d5619f19151092df5cabbbf16ce835
+// 87c974b93032afd42143613434d1a7788fa598f9
-- 
cgit v1.2.3


From a015b7085979b12e55f67f3b86be0321fff6be3f Mon Sep 17 00:00:00 2001
From: Alexander Potapenko <glider@google.com>
Date: Tue, 30 Nov 2021 12:44:32 +0100
Subject: compiler_attributes.h: Add __disable_sanitizer_instrumentation

The new attribute maps to
__attribute__((disable_sanitizer_instrumentation)), which will be
supported by Clang >= 14.0. Future support in GCC is also possible.

This attribute disables compiler instrumentation for kernel sanitizer
tools, making it easier to implement noinstr. It is different from the
existing __no_sanitize* attributes, which may still allow certain types
of instrumentation to prevent false positives.

Signed-off-by: Alexander Potapenko <glider@google.com>
Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/compiler_attributes.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h
index b9121afd8733..37e260020221 100644
--- a/include/linux/compiler_attributes.h
+++ b/include/linux/compiler_attributes.h
@@ -308,6 +308,24 @@
 # define __compiletime_warning(msg)
 #endif
 
+/*
+ * Optional: only supported since clang >= 14.0
+ *
+ * clang: https://clang.llvm.org/docs/AttributeReference.html#disable-sanitizer-instrumentation
+ *
+ * disable_sanitizer_instrumentation is not always similar to
+ * no_sanitize((<sanitizer-name>)): the latter may still let specific sanitizers
+ * insert code into functions to prevent false positives. Unlike that,
+ * disable_sanitizer_instrumentation prevents all kinds of instrumentation to
+ * functions with the attribute.
+ */
+#if __has_attribute(disable_sanitizer_instrumentation)
+# define __disable_sanitizer_instrumentation \
+	 __attribute__((disable_sanitizer_instrumentation))
+#else
+# define __disable_sanitizer_instrumentation
+#endif
+
 /*
  *   gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-weak-function-attribute
  *   gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#index-weak-variable-attribute
-- 
cgit v1.2.3


From bd3d5bd1a0ad386475ea7a3de8a91e7d8a600536 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Tue, 30 Nov 2021 12:44:33 +0100
Subject: kcsan: Support WEAK_MEMORY with Clang where no objtool support exists

Clang and GCC behave a little differently when it comes to the
__no_sanitize_thread attribute, which has valid reasons, and depending
on context either one could be right.

Traditionally, user space ThreadSanitizer [1] still expects instrumented
builtin atomics (to avoid false positives) and __tsan_func_{entry,exit}
(to generate meaningful stack traces), even if the function has the
attribute no_sanitize("thread").

[1] https://clang.llvm.org/docs/ThreadSanitizer.html#attribute-no-sanitize-thread

GCC doesn't follow the same policy (for better or worse), and removes
all kinds of instrumentation if no_sanitize is added. Arguably, since
this may be a problem for user space ThreadSanitizer, we expect this may
change in future.

Since KCSAN != ThreadSanitizer, the likelihood of false positives even
without barrier instrumentation everywhere, is much lower by design.

At least for Clang, however, to fully remove all sanitizer
instrumentation, we must add the disable_sanitizer_instrumentation
attribute, which is available since Clang 14.0.

Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/compiler_types.h | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 1d32f4c03c9e..3c1795fdb568 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -198,9 +198,20 @@ struct ftrace_likely_data {
 # define __no_kasan_or_inline __always_inline
 #endif
 
-#define __no_kcsan __no_sanitize_thread
 #ifdef __SANITIZE_THREAD__
+/*
+ * Clang still emits instrumentation for __tsan_func_{entry,exit}() and builtin
+ * atomics even with __no_sanitize_thread (to avoid false positives in userspace
+ * ThreadSanitizer). The kernel's requirements are stricter and we really do not
+ * want any instrumentation with __no_kcsan.
+ *
+ * Therefore we add __disable_sanitizer_instrumentation where available to
+ * disable all instrumentation. See Kconfig.kcsan where this is mandatory.
+ */
+# define __no_kcsan __no_sanitize_thread __disable_sanitizer_instrumentation
 # define __no_sanitize_or_inline __no_kcsan notrace __maybe_unused
+#else
+# define __no_kcsan
 #endif
 
 #ifndef __no_sanitize_or_inline
-- 
cgit v1.2.3


From 80d7476fa20a3cc83f76b3b02a7575891d1e7511 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Sat, 4 Dec 2021 13:57:03 +0100
Subject: kcsan: Turn barrier instrumentation into macros

Some architectures use barriers in 'extern inline' functions, from which
we should not refer to static inline functions.

For example, building Alpha with gcc and W=1 shows:

./include/asm-generic/barrier.h:70:30: warning: 'kcsan_rmb' is static but used in inline function 'pmd_offset' which is not static
   70 | #define smp_rmb()       do { kcsan_rmb(); __smp_rmb(); } while (0)
      |                              ^~~~~~~~~
./arch/alpha/include/asm/pgtable.h:293:9: note: in expansion of macro 'smp_rmb'
  293 |         smp_rmb(); /* see above */
      |         ^~~~~~~

Which seems to warn about 6.7.4#3 of the C standard:
  "An inline definition of a function with external linkage shall not
   contain a definition of a modifiable object with static or thread
   storage duration, and shall not contain a reference to an identifier
   with internal linkage."

Fix it by turning barrier instrumentation into macros, which matches
definitions in <asm/barrier.h>.

Perhaps we can revert this change in future, when there are no more
'extern inline' users left.

Link: https://lkml.kernel.org/r/202112041334.X44uWZXf-lkp@intel.com
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/kcsan-checks.h | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kcsan-checks.h b/include/linux/kcsan-checks.h
index 9d2c869167f2..92f3843d9ebb 100644
--- a/include/linux/kcsan-checks.h
+++ b/include/linux/kcsan-checks.h
@@ -241,28 +241,30 @@ static inline void __kcsan_disable_current(void) { }
  * disabled with the __no_kcsan function attribute.
  *
  * Also see definition of __tsan_atomic_signal_fence() in kernel/kcsan/core.c.
+ *
+ * These are all macros, like <asm/barrier.h>, since some architectures use them
+ * in non-static inline functions.
  */
 #define __KCSAN_BARRIER_TO_SIGNAL_FENCE(name)					\
-	static __always_inline void kcsan_##name(void)				\
-	{									\
+	do {									\
 		barrier();							\
 		__atomic_signal_fence(__KCSAN_BARRIER_TO_SIGNAL_FENCE_##name);	\
 		barrier();							\
-	}
-__KCSAN_BARRIER_TO_SIGNAL_FENCE(mb)
-__KCSAN_BARRIER_TO_SIGNAL_FENCE(wmb)
-__KCSAN_BARRIER_TO_SIGNAL_FENCE(rmb)
-__KCSAN_BARRIER_TO_SIGNAL_FENCE(release)
+	} while (0)
+#define kcsan_mb()	__KCSAN_BARRIER_TO_SIGNAL_FENCE(mb)
+#define kcsan_wmb()	__KCSAN_BARRIER_TO_SIGNAL_FENCE(wmb)
+#define kcsan_rmb()	__KCSAN_BARRIER_TO_SIGNAL_FENCE(rmb)
+#define kcsan_release()	__KCSAN_BARRIER_TO_SIGNAL_FENCE(release)
 #elif defined(CONFIG_KCSAN_WEAK_MEMORY) && defined(__KCSAN_INSTRUMENT_BARRIERS__)
 #define kcsan_mb	__kcsan_mb
 #define kcsan_wmb	__kcsan_wmb
 #define kcsan_rmb	__kcsan_rmb
 #define kcsan_release	__kcsan_release
 #else /* CONFIG_KCSAN_WEAK_MEMORY && ... */
-static inline void kcsan_mb(void)		{ }
-static inline void kcsan_wmb(void)		{ }
-static inline void kcsan_rmb(void)		{ }
-static inline void kcsan_release(void)		{ }
+#define kcsan_mb()	do { } while (0)
+#define kcsan_wmb()	do { } while (0)
+#define kcsan_rmb()	do { } while (0)
+#define kcsan_release()	do { } while (0)
 #endif /* CONFIG_KCSAN_WEAK_MEMORY && ... */
 
 /**
-- 
cgit v1.2.3


From db67097aa6f2587b44055f2e16db72a11e17faef Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 9 Dec 2021 21:31:56 -0700
Subject: pktdvd: stop using bdi congestion framework.

The bdi congestion framework isn't widely used and should be
deprecated.

pktdvd makes use of it to track congestion, but this can be done
entirely internally to pktdvd, so it doesn't need to use the framework.

So introduce a "congested" flag.  When waiting for bio_queue_size to
drop, set this flag and a var_waitqueue() to wait for it.  When
bio_queue_size does drop and this flag is set, clear the flag and call
wake_up_var().

We don't use a wait_var_event macro for the waiting as we need to set
the flag and drop the spinlock before calling schedule() and while that
is possible with __wait_var_event(), result is not easy to read.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: NeilBrown <neilb@suse.de>
Link: https://lore.kernel.org/r/163910843527.9928.857338663717630212@noble.neil.brown.name
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/pktcdvd.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pktcdvd.h b/include/linux/pktcdvd.h
index 174601554b06..c391e694aa26 100644
--- a/include/linux/pktcdvd.h
+++ b/include/linux/pktcdvd.h
@@ -183,6 +183,8 @@ struct pktcdvd_device
 	spinlock_t		lock;		/* Serialize access to bio_queue */
 	struct rb_root		bio_queue;	/* Work queue of bios we need to handle */
 	int			bio_queue_size;	/* Number of nodes in bio_queue */
+	bool			congested;	/* Someone is waiting for bio_queue_size
+						 * to drop. */
 	sector_t		current_sector;	/* Keep track of where the elevator is */
 	atomic_t		scan_queue;	/* Set to non-zero when pkt_handle_queue */
 						/* needs to be run. */
-- 
cgit v1.2.3


From f95711242390d759f69fd67ad46b31491fe904d6 Mon Sep 17 00:00:00 2001
From: Yazen Ghannam <yazen.ghannam@amd.com>
Date: Wed, 8 Dec 2021 17:43:53 +0000
Subject: EDAC: Add RDDR5 and LRDDR5 memory types

Include Registered-DDR5 and Load-Reduced DDR5 in the list of memory
types.

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lore.kernel.org/r/20211208174356.1997855-2-yazen.ghannam@amd.com
---
 include/linux/edac.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/edac.h b/include/linux/edac.h
index 4207d06996a4..e730b3468719 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -182,6 +182,8 @@ static inline char *mc_event_error_type(const unsigned int err_type)
  * @MEM_LRDDR4:		Load-Reduced DDR4 memory.
  * @MEM_LPDDR4:		Low-Power DDR4 memory.
  * @MEM_DDR5:		Unbuffered DDR5 RAM
+ * @MEM_RDDR5:		Registered DDR5 RAM
+ * @MEM_LRDDR5:		Load-Reduced DDR5 memory.
  * @MEM_NVDIMM:		Non-volatile RAM
  * @MEM_WIO2:		Wide I/O 2.
  * @MEM_HBM2:		High bandwidth Memory Gen 2.
@@ -211,6 +213,8 @@ enum mem_type {
 	MEM_LRDDR4,
 	MEM_LPDDR4,
 	MEM_DDR5,
+	MEM_RDDR5,
+	MEM_LRDDR5,
 	MEM_NVDIMM,
 	MEM_WIO2,
 	MEM_HBM2,
@@ -239,6 +243,8 @@ enum mem_type {
 #define MEM_FLAG_LRDDR4         BIT(MEM_LRDDR4)
 #define MEM_FLAG_LPDDR4         BIT(MEM_LPDDR4)
 #define MEM_FLAG_DDR5           BIT(MEM_DDR5)
+#define MEM_FLAG_RDDR5          BIT(MEM_RDDR5)
+#define MEM_FLAG_LRDDR5         BIT(MEM_LRDDR5)
 #define MEM_FLAG_NVDIMM         BIT(MEM_NVDIMM)
 #define MEM_FLAG_WIO2		BIT(MEM_WIO2)
 #define MEM_FLAG_HBM2		BIT(MEM_HBM2)
-- 
cgit v1.2.3


From 1614b2b11fab29dd4ff31ebba9d266961f5af69e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 29 Nov 2021 14:28:41 +0000
Subject: arch: Make ARCH_STACKWALK independent of STACKTRACE

Make arch_stack_walk() available for ARCH_STACKWALK architectures
without it being entangled in STACKTRACE.

Link: https://lore.kernel.org/lkml/20211022152104.356586621@infradead.org/
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
[Mark: rebase, drop unnecessary arm change]
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Link: https://lore.kernel.org/r/20211129142849.3056714-2-mark.rutland@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 include/linux/stacktrace.h | 35 ++++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h
index bef158815e83..97455880ac41 100644
--- a/include/linux/stacktrace.h
+++ b/include/linux/stacktrace.h
@@ -8,22 +8,6 @@
 struct task_struct;
 struct pt_regs;
 
-#ifdef CONFIG_STACKTRACE
-void stack_trace_print(const unsigned long *trace, unsigned int nr_entries,
-		       int spaces);
-int stack_trace_snprint(char *buf, size_t size, const unsigned long *entries,
-			unsigned int nr_entries, int spaces);
-unsigned int stack_trace_save(unsigned long *store, unsigned int size,
-			      unsigned int skipnr);
-unsigned int stack_trace_save_tsk(struct task_struct *task,
-				  unsigned long *store, unsigned int size,
-				  unsigned int skipnr);
-unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store,
-				   unsigned int size, unsigned int skipnr);
-unsigned int stack_trace_save_user(unsigned long *store, unsigned int size);
-unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries);
-
-/* Internal interfaces. Do not use in generic code */
 #ifdef CONFIG_ARCH_STACKWALK
 
 /**
@@ -76,8 +60,25 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, void *cookie,
 
 void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie,
 			  const struct pt_regs *regs);
+#endif /* CONFIG_ARCH_STACKWALK */
 
-#else /* CONFIG_ARCH_STACKWALK */
+#ifdef CONFIG_STACKTRACE
+void stack_trace_print(const unsigned long *trace, unsigned int nr_entries,
+		       int spaces);
+int stack_trace_snprint(char *buf, size_t size, const unsigned long *entries,
+			unsigned int nr_entries, int spaces);
+unsigned int stack_trace_save(unsigned long *store, unsigned int size,
+			      unsigned int skipnr);
+unsigned int stack_trace_save_tsk(struct task_struct *task,
+				  unsigned long *store, unsigned int size,
+				  unsigned int skipnr);
+unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store,
+				   unsigned int size, unsigned int skipnr);
+unsigned int stack_trace_save_user(unsigned long *store, unsigned int size);
+unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries);
+
+#ifndef CONFIG_ARCH_STACKWALK
+/* Internal interfaces. Do not use in generic code */
 struct stack_trace {
 	unsigned int nr_entries, max_entries;
 	unsigned long *entries;
-- 
cgit v1.2.3


From 9ba74e6c9e9d0c5c1e5792a7111fc7d1a0589cb8 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 9 Dec 2021 23:44:21 -0800
Subject: net: add networking namespace refcount tracker

We have 100+ syzbot reports about netns being dismantled too soon,
still unresolved as of today.

We think a missing get_net() or an extra put_net() is the root cause.

In order to find the bug(s), and be able to spot future ones,
this patch adds CONFIG_NET_NS_REFCNT_TRACKER and new helpers
to precisely pair all put_net() with corresponding get_net().

To use these helpers, each data structure owning a refcount
should also use a "netns_tracker" to pair the get and put.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 1a748ee9a421..235d5d082f1a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -48,7 +48,7 @@
 #include <uapi/linux/pkt_cls.h>
 #include <linux/hashtable.h>
 #include <linux/rbtree.h>
-#include <linux/ref_tracker.h>
+#include <net/net_trackers.h>
 
 struct netpoll_info;
 struct device;
@@ -300,13 +300,6 @@ enum netdev_state_t {
 	__LINK_STATE_TESTING,
 };
 
-
-#ifdef CONFIG_NET_DEV_REFCNT_TRACKER
-typedef struct ref_tracker *netdevice_tracker;
-#else
-typedef struct {} netdevice_tracker;
-#endif
-
 struct gro_list {
 	struct list_head	list;
 	int			count;
-- 
cgit v1.2.3


From 04a931e58d1944ab3d1e11fdfde1947fbe5b6a37 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 9 Dec 2021 23:44:23 -0800
Subject: net: add netns refcount tracker to struct seq_net_private

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/seq_file_net.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/seq_file_net.h b/include/linux/seq_file_net.h
index 0fdbe1ddd8d1..b97912fdbae7 100644
--- a/include/linux/seq_file_net.h
+++ b/include/linux/seq_file_net.h
@@ -9,7 +9,8 @@ extern struct net init_net;
 
 struct seq_net_private {
 #ifdef CONFIG_NET_NS
-	struct net *net;
+	struct net	*net;
+	netns_tracker	ns_tracker;
 #endif
 };
 
-- 
cgit v1.2.3


From 65c7cdedeb3026fabcc967a7aae2f755ad4d0783 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 3 Sep 2021 11:24:17 -0400
Subject: genirq: Provide new interfaces for affinity hints

The discussion about removing the side effect of irq_set_affinity_hint() of
actually applying the cpumask (if not NULL) as affinity to the interrupt,
unearthed a few unpleasantries:

  1) The modular perf drivers rely on the current behaviour for the very
     wrong reasons.

  2) While none of the other drivers prevents user space from changing
     the affinity, a cursorily inspection shows that there are at least
     expectations in some drivers.

#1 needs to be cleaned up anyway, so that's not a problem

#2 might result in subtle regressions especially when irqbalanced (which
   nowadays ignores the affinity hint) is disabled.

Provide new interfaces:

  irq_update_affinity_hint()  - Only sets the affinity hint pointer
  irq_set_affinity_and_hint() - Set the pointer and apply the affinity to
                                the interrupt

Make irq_set_affinity_hint() a wrapper around irq_apply_affinity_hint() and
document it to be phased out.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Nitesh Narayan Lal <nitesh@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20210501021832.743094-1-jesse.brandeburg@intel.com
Link: https://lore.kernel.org/r/20210903152430.244937-2-nitesh@redhat.com
---
 include/linux/interrupt.h | 53 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 52 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 1f22a30c0963..9367f1cb2e3c 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -329,7 +329,46 @@ extern int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask);
 extern int irq_can_set_affinity(unsigned int irq);
 extern int irq_select_affinity(unsigned int irq);
 
-extern int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m);
+extern int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m,
+				     bool setaffinity);
+
+/**
+ * irq_update_affinity_hint - Update the affinity hint
+ * @irq:	Interrupt to update
+ * @m:		cpumask pointer (NULL to clear the hint)
+ *
+ * Updates the affinity hint, but does not change the affinity of the interrupt.
+ */
+static inline int
+irq_update_affinity_hint(unsigned int irq, const struct cpumask *m)
+{
+	return __irq_apply_affinity_hint(irq, m, false);
+}
+
+/**
+ * irq_set_affinity_and_hint - Update the affinity hint and apply the provided
+ *			     cpumask to the interrupt
+ * @irq:	Interrupt to update
+ * @m:		cpumask pointer (NULL to clear the hint)
+ *
+ * Updates the affinity hint and if @m is not NULL it applies it as the
+ * affinity of that interrupt.
+ */
+static inline int
+irq_set_affinity_and_hint(unsigned int irq, const struct cpumask *m)
+{
+	return __irq_apply_affinity_hint(irq, m, true);
+}
+
+/*
+ * Deprecated. Use irq_update_affinity_hint() or irq_set_affinity_and_hint()
+ * instead.
+ */
+static inline int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
+{
+	return irq_set_affinity_and_hint(irq, m);
+}
+
 extern int irq_update_affinity_desc(unsigned int irq,
 				    struct irq_affinity_desc *affinity);
 
@@ -361,6 +400,18 @@ static inline int irq_can_set_affinity(unsigned int irq)
 
 static inline int irq_select_affinity(unsigned int irq)  { return 0; }
 
+static inline int irq_update_affinity_hint(unsigned int irq,
+					   const struct cpumask *m)
+{
+	return -EINVAL;
+}
+
+static inline int irq_set_affinity_and_hint(unsigned int irq,
+					    const struct cpumask *m)
+{
+	return -EINVAL;
+}
+
 static inline int irq_set_affinity_hint(unsigned int irq,
 					const struct cpumask *m)
 {
-- 
cgit v1.2.3


From e4779015fd5d2fb8390c258268addff24d6077c7 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 10 Dec 2021 14:46:22 -0800
Subject: timers: implement usleep_idle_range()

Patch series "mm/damon: Fix fake /proc/loadavg reports", v3.

This patchset fixes DAMON's fake load report issue.  The first patch
makes yet another variant of usleep_range() for this fix, and the second
patch fixes the issue of DAMON by making it using the newly introduced
function.

This patch (of 2):

Some kernel threads such as DAMON could need to repeatedly sleep in
micro seconds level.  Because usleep_range() sleeps in uninterruptible
state, however, such threads would make /proc/loadavg reports fake load.

To help such cases, this commit implements a variant of usleep_range()
called usleep_idle_range().  It is same to usleep_range() but sets the
state of the current task as TASK_IDLE while sleeping.

Link: https://lkml.kernel.org/r/20211126145015.15862-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20211126145015.15862-2-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Suggested-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Cc: John Stultz <john.stultz@linaro.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/delay.h | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/delay.h b/include/linux/delay.h
index 8eacf67eb212..039e7e0c7378 100644
--- a/include/linux/delay.h
+++ b/include/linux/delay.h
@@ -20,6 +20,7 @@
  */
 
 #include <linux/math.h>
+#include <linux/sched.h>
 
 extern unsigned long loops_per_jiffy;
 
@@ -58,7 +59,18 @@ void calibrate_delay(void);
 void __attribute__((weak)) calibration_delay_done(void);
 void msleep(unsigned int msecs);
 unsigned long msleep_interruptible(unsigned int msecs);
-void usleep_range(unsigned long min, unsigned long max);
+void usleep_range_state(unsigned long min, unsigned long max,
+			unsigned int state);
+
+static inline void usleep_range(unsigned long min, unsigned long max)
+{
+	usleep_range_state(min, max, TASK_UNINTERRUPTIBLE);
+}
+
+static inline void usleep_idle_range(unsigned long min, unsigned long max)
+{
+	usleep_range_state(min, max, TASK_IDLE);
+}
 
 static inline void ssleep(unsigned int seconds)
 {
-- 
cgit v1.2.3


From bff8c3848e071d387d8b0784dc91fa49cd563774 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 10 Nov 2021 11:01:03 +0100
Subject: bitfield.h: Fix "type of reg too small for mask" test

The test: 'mask > (typeof(_reg))~0ull' only works correctly when both
sides are unsigned, consider:

 - 0xff000000 vs (int)~0ull
 - 0x000000ff vs (int)~0ull

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
Link: https://lore.kernel.org/r/20211110101324.950210584@infradead.org
---
 include/linux/bitfield.h | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h
index 4e035aca6f7e..6093fa6db260 100644
--- a/include/linux/bitfield.h
+++ b/include/linux/bitfield.h
@@ -41,6 +41,22 @@
 
 #define __bf_shf(x) (__builtin_ffsll(x) - 1)
 
+#define __scalar_type_to_unsigned_cases(type)				\
+		unsigned type:	(unsigned type)0,			\
+		signed type:	(unsigned type)0
+
+#define __unsigned_scalar_typeof(x) typeof(				\
+		_Generic((x),						\
+			char:	(unsigned char)0,			\
+			__scalar_type_to_unsigned_cases(char),		\
+			__scalar_type_to_unsigned_cases(short),		\
+			__scalar_type_to_unsigned_cases(int),		\
+			__scalar_type_to_unsigned_cases(long),		\
+			__scalar_type_to_unsigned_cases(long long),	\
+			default: (x)))
+
+#define __bf_cast_unsigned(type, x)	((__unsigned_scalar_typeof(type))(x))
+
 #define __BF_FIELD_CHECK(_mask, _reg, _val, _pfx)			\
 	({								\
 		BUILD_BUG_ON_MSG(!__builtin_constant_p(_mask),		\
@@ -49,7 +65,8 @@
 		BUILD_BUG_ON_MSG(__builtin_constant_p(_val) ?		\
 				 ~((_mask) >> __bf_shf(_mask)) & (_val) : 0, \
 				 _pfx "value too large for the field"); \
-		BUILD_BUG_ON_MSG((_mask) > (typeof(_reg))~0ull,		\
+		BUILD_BUG_ON_MSG(__bf_cast_unsigned(_mask, _mask) >	\
+				 __bf_cast_unsigned(_reg, ~0ull),	\
 				 _pfx "type of reg too small for mask"); \
 		__BUILD_BUG_ON_NOT_POWER_OF_2((_mask) +			\
 					      (1ULL << __bf_shf(_mask))); \
-- 
cgit v1.2.3


From 4121485d271bd730537f613ce041e7ea659606a7 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 9 Dec 2021 21:52:31 +0200
Subject: PCI: Sort Intel Device IDs by value
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Sort Intel Device IDs by value.

[bhelgaas: lower-case Intel section since we're touching it anyway]
Link: https://lore.kernel.org/r/20211209195231.2785-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Krzysztof Wilczyński <kw@linux.com>
---
 include/linux/pci_ids.h | 50 ++++++++++++++++++++++++-------------------------
 1 file changed, 25 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 011f2f1ea5bb..0d26ab7eb7dc 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2635,8 +2635,8 @@
 #define PCI_DEVICE_ID_INTEL_PXHD_0	0x0320
 #define PCI_DEVICE_ID_INTEL_PXHD_1	0x0321
 #define PCI_DEVICE_ID_INTEL_PXH_0	0x0329
-#define PCI_DEVICE_ID_INTEL_PXH_1	0x032A
-#define PCI_DEVICE_ID_INTEL_PXHV	0x032C
+#define PCI_DEVICE_ID_INTEL_PXH_1	0x032a
+#define PCI_DEVICE_ID_INTEL_PXHV	0x032c
 #define PCI_DEVICE_ID_INTEL_80332_0	0x0330
 #define PCI_DEVICE_ID_INTEL_80332_1	0x0332
 #define PCI_DEVICE_ID_INTEL_80333_0	0x0370
@@ -2654,14 +2654,14 @@
 #define PCI_DEVICE_ID_INTEL_MFD_SDIO2	0x0822
 #define PCI_DEVICE_ID_INTEL_MFD_EMMC0	0x0823
 #define PCI_DEVICE_ID_INTEL_MFD_EMMC1	0x0824
-#define PCI_DEVICE_ID_INTEL_MRST_SD2	0x084F
-#define PCI_DEVICE_ID_INTEL_QUARK_X1000_ILB	0x095E
+#define PCI_DEVICE_ID_INTEL_MRST_SD2	0x084f
+#define PCI_DEVICE_ID_INTEL_QUARK_X1000_ILB	0x095e
 #define PCI_DEVICE_ID_INTEL_I960	0x0960
 #define PCI_DEVICE_ID_INTEL_I960RM	0x0962
 #define PCI_DEVICE_ID_INTEL_CENTERTON_ILB	0x0c60
 #define PCI_DEVICE_ID_INTEL_8257X_SOL	0x1062
 #define PCI_DEVICE_ID_INTEL_82573E_SOL	0x1085
-#define PCI_DEVICE_ID_INTEL_82573L_SOL	0x108F
+#define PCI_DEVICE_ID_INTEL_82573L_SOL	0x108f
 #define PCI_DEVICE_ID_INTEL_82815_MC	0x1130
 #define PCI_DEVICE_ID_INTEL_82815_CGC	0x1132
 #define PCI_DEVICE_ID_INTEL_82092AA_0	0x1221
@@ -2755,12 +2755,6 @@
 #define PCI_DEVICE_ID_INTEL_82801EB_11	0x24db
 #define PCI_DEVICE_ID_INTEL_82801EB_12	0x24dc
 #define PCI_DEVICE_ID_INTEL_82801EB_13	0x24dd
-#define PCI_DEVICE_ID_INTEL_ESB_1	0x25a1
-#define PCI_DEVICE_ID_INTEL_ESB_2	0x25a2
-#define PCI_DEVICE_ID_INTEL_ESB_4	0x25a4
-#define PCI_DEVICE_ID_INTEL_ESB_5	0x25a6
-#define PCI_DEVICE_ID_INTEL_ESB_9	0x25ab
-#define PCI_DEVICE_ID_INTEL_ESB_10	0x25ac
 #define PCI_DEVICE_ID_INTEL_82820_HB	0x2500
 #define PCI_DEVICE_ID_INTEL_82820_UP_HB	0x2501
 #define PCI_DEVICE_ID_INTEL_82850_HB	0x2530
@@ -2775,14 +2769,15 @@
 #define PCI_DEVICE_ID_INTEL_82915G_IG	0x2582
 #define PCI_DEVICE_ID_INTEL_82915GM_HB	0x2590
 #define PCI_DEVICE_ID_INTEL_82915GM_IG	0x2592
-#define PCI_DEVICE_ID_INTEL_5000_ERR	0x25F0
-#define PCI_DEVICE_ID_INTEL_5000_FBD0	0x25F5
-#define PCI_DEVICE_ID_INTEL_5000_FBD1	0x25F6
-#define PCI_DEVICE_ID_INTEL_82945G_HB	0x2770
-#define PCI_DEVICE_ID_INTEL_82945G_IG	0x2772
-#define PCI_DEVICE_ID_INTEL_3000_HB	0x2778
-#define PCI_DEVICE_ID_INTEL_82945GM_HB	0x27A0
-#define PCI_DEVICE_ID_INTEL_82945GM_IG	0x27A2
+#define PCI_DEVICE_ID_INTEL_ESB_1	0x25a1
+#define PCI_DEVICE_ID_INTEL_ESB_2	0x25a2
+#define PCI_DEVICE_ID_INTEL_ESB_4	0x25a4
+#define PCI_DEVICE_ID_INTEL_ESB_5	0x25a6
+#define PCI_DEVICE_ID_INTEL_ESB_9	0x25ab
+#define PCI_DEVICE_ID_INTEL_ESB_10	0x25ac
+#define PCI_DEVICE_ID_INTEL_5000_ERR	0x25f0
+#define PCI_DEVICE_ID_INTEL_5000_FBD0	0x25f5
+#define PCI_DEVICE_ID_INTEL_5000_FBD1	0x25f6
 #define PCI_DEVICE_ID_INTEL_ICH6_0	0x2640
 #define PCI_DEVICE_ID_INTEL_ICH6_1	0x2641
 #define PCI_DEVICE_ID_INTEL_ICH6_2	0x2642
@@ -2794,6 +2789,11 @@
 #define PCI_DEVICE_ID_INTEL_ESB2_14	0x2698
 #define PCI_DEVICE_ID_INTEL_ESB2_17	0x269b
 #define PCI_DEVICE_ID_INTEL_ESB2_18	0x269e
+#define PCI_DEVICE_ID_INTEL_82945G_HB	0x2770
+#define PCI_DEVICE_ID_INTEL_82945G_IG	0x2772
+#define PCI_DEVICE_ID_INTEL_3000_HB	0x2778
+#define PCI_DEVICE_ID_INTEL_82945GM_HB	0x27a0
+#define PCI_DEVICE_ID_INTEL_82945GM_IG	0x27a2
 #define PCI_DEVICE_ID_INTEL_ICH7_0	0x27b8
 #define PCI_DEVICE_ID_INTEL_ICH7_1	0x27b9
 #define PCI_DEVICE_ID_INTEL_ICH7_30	0x27b0
@@ -2846,7 +2846,7 @@
 #define PCI_DEVICE_ID_INTEL_LYNNFIELD_QPI_PHY0    0x2c91
 #define PCI_DEVICE_ID_INTEL_LYNNFIELD_MCR         0x2c98
 #define PCI_DEVICE_ID_INTEL_LYNNFIELD_MC_TAD      0x2c99
-#define PCI_DEVICE_ID_INTEL_LYNNFIELD_MC_TEST     0x2c9C
+#define PCI_DEVICE_ID_INTEL_LYNNFIELD_MC_TEST     0x2c9c
 #define PCI_DEVICE_ID_INTEL_LYNNFIELD_MC_CH0_CTRL 0x2ca0
 #define PCI_DEVICE_ID_INTEL_LYNNFIELD_MC_CH0_ADDR 0x2ca1
 #define PCI_DEVICE_ID_INTEL_LYNNFIELD_MC_CH0_RANK 0x2ca2
@@ -2958,16 +2958,16 @@
 #define PCI_DEVICE_ID_INTEL_SBRIDGE_BR		0x3cf5	/* 13.6 */
 #define PCI_DEVICE_ID_INTEL_SBRIDGE_SAD1	0x3cf6	/* 12.7 */
 #define PCI_DEVICE_ID_INTEL_IOAT_SNB	0x402f
-#define PCI_DEVICE_ID_INTEL_5100_16	0x65f0
-#define PCI_DEVICE_ID_INTEL_5100_19	0x65f3
-#define PCI_DEVICE_ID_INTEL_5100_21	0x65f5
-#define PCI_DEVICE_ID_INTEL_5100_22	0x65f6
 #define PCI_DEVICE_ID_INTEL_5400_ERR	0x4030
 #define PCI_DEVICE_ID_INTEL_5400_FBD0	0x4035
 #define PCI_DEVICE_ID_INTEL_5400_FBD1	0x4036
-#define PCI_DEVICE_ID_INTEL_IOAT_SCNB	0x65ff
 #define PCI_DEVICE_ID_INTEL_EP80579_0	0x5031
 #define PCI_DEVICE_ID_INTEL_EP80579_1	0x5032
+#define PCI_DEVICE_ID_INTEL_5100_16	0x65f0
+#define PCI_DEVICE_ID_INTEL_5100_19	0x65f3
+#define PCI_DEVICE_ID_INTEL_5100_21	0x65f5
+#define PCI_DEVICE_ID_INTEL_5100_22	0x65f6
+#define PCI_DEVICE_ID_INTEL_IOAT_SCNB	0x65ff
 #define PCI_DEVICE_ID_INTEL_82371SB_0	0x7000
 #define PCI_DEVICE_ID_INTEL_82371SB_1	0x7010
 #define PCI_DEVICE_ID_INTEL_82371SB_2	0x7020
-- 
cgit v1.2.3


From c5fb19937455095573a19ddcbff32e993ed10e35 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Fri, 10 Dec 2021 22:16:49 +0800
Subject: bpf: Add bpf_strncmp helper

The helper compares two strings: one string is a null-terminated
read-only string, and another string has const max storage size
but doesn't need to be null-terminated. It can be used to compare
file name in tracing or LSM program.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211210141652.877186-2-houtao1@huawei.com
---
 include/linux/bpf.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0ceb54c6342f..7a40022e3d00 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2163,6 +2163,7 @@ extern const struct bpf_func_proto bpf_sk_getsockopt_proto;
 extern const struct bpf_func_proto bpf_kallsyms_lookup_name_proto;
 extern const struct bpf_func_proto bpf_find_vma_proto;
 extern const struct bpf_func_proto bpf_loop_proto;
+extern const struct bpf_func_proto bpf_strncmp_proto;
 
 const struct bpf_func_proto *tracing_prog_func_proto(
   enum bpf_func_id func_id, const struct bpf_prog *prog);
-- 
cgit v1.2.3


From 35d976802124303a5b3eb7ec3ed188d568204373 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 10 Dec 2021 01:34:38 +0200
Subject: net: dsa: tag_ocelot: convert to tagger-owned data

The felix driver makes very light use of dp->priv, and the tagger is
effectively stateless. dp->priv is practically only needed to set up a
callback to perform deferred xmit of PTP and STP packets using the
ocelot-8021q tagging protocol (the main ocelot tagging protocol makes no
use of dp->priv, although this driver sets up dp->priv irrespective of
actual tagging protocol in use).

struct felix_port (what used to be pointed to by dp->priv) is removed
and replaced with a two-sided structure. The public side of this
structure, visible to the switch driver, is ocelot_8021q_tagger_data.
The private side is ocelot_8021q_tagger_private, and the latter
structure physically encapsulates the former. The public half of the
tagger data structure can be accessed through a helper of the same name
(ocelot_8021q_tagger_data) which also sanity-checks the protocol
currently in use by the switch. The public/private split was requested
by Andrew Lunn.

Suggested-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dsa/ocelot.h | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dsa/ocelot.h b/include/linux/dsa/ocelot.h
index 7ee708ad7df2..dca2969015d8 100644
--- a/include/linux/dsa/ocelot.h
+++ b/include/linux/dsa/ocelot.h
@@ -8,6 +8,7 @@
 #include <linux/kthread.h>
 #include <linux/packing.h>
 #include <linux/skbuff.h>
+#include <net/dsa.h>
 
 struct ocelot_skb_cb {
 	struct sk_buff *clone;
@@ -168,11 +169,18 @@ struct felix_deferred_xmit_work {
 	struct kthread_work work;
 };
 
-struct felix_port {
+struct ocelot_8021q_tagger_data {
 	void (*xmit_work_fn)(struct kthread_work *work);
-	struct kthread_worker *xmit_worker;
 };
 
+static inline struct ocelot_8021q_tagger_data *
+ocelot_8021q_tagger_data(struct dsa_switch *ds)
+{
+	BUG_ON(ds->dst->tag_ops->proto != DSA_TAG_PROTO_OCELOT_8021Q);
+
+	return ds->tagger_data;
+}
+
 static inline void ocelot_xfh_get_rew_val(void *extraction, u64 *rew_val)
 {
 	packing(extraction, rew_val, 116, 85, OCELOT_TAG_LEN, UNPACK, 0);
-- 
cgit v1.2.3


From d38049bbe7601f38d598f5da5ff09980483b290a Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 10 Dec 2021 01:34:40 +0200
Subject: net: dsa: sja1105: bring deferred xmit implementation in line with
 ocelot-8021q

When the ocelot-8021q driver was converted to deferred xmit as part of
commit 8d5f7954b7c8 ("net: dsa: felix: break at first CPU port during
init and teardown"), the deferred implementation was deliberately made
subtly different from what sja1105 has.

The implementation differences lied on the following observations:

- There might be a race between these two lines in tag_sja1105.c:

       skb_queue_tail(&sp->xmit_queue, skb_get(skb));
       kthread_queue_work(sp->xmit_worker, &sp->xmit_work);

  and the skb dequeue logic in sja1105_port_deferred_xmit(). For
  example, the xmit_work might be already queued, however the work item
  has just finished walking through the skb queue. Because we don't
  check the return code from kthread_queue_work, we don't do anything if
  the work item is already queued.

  However, nobody will take that skb and send it, at least until the
  next timestampable skb is sent. This creates additional (and
  avoidable) TX timestamping latency.

  To close that race, what the ocelot-8021q driver does is it doesn't
  keep a single work item per port, and a skb timestamping queue, but
  rather dynamically allocates a work item per packet.

- It is also unnecessary to have more than one kthread that does the
  work. So delete the per-port kthread allocations and replace them with
  a single kthread which is global to the switch.

This change brings the two implementations in line by applying those
observations to the sja1105 driver as well.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dsa/sja1105.h | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h
index e6c78be40bde..acd9d2afccab 100644
--- a/include/linux/dsa/sja1105.h
+++ b/include/linux/dsa/sja1105.h
@@ -37,6 +37,12 @@
 
 #define SJA1105_HWTS_RX_EN			0
 
+struct sja1105_deferred_xmit_work {
+	struct dsa_port *dp;
+	struct sk_buff *skb;
+	struct kthread_work work;
+};
+
 /* Global tagger data: each struct sja1105_port has a reference to
  * the structure defined in struct sja1105_private.
  */
@@ -52,6 +58,8 @@ struct sja1105_tagger_data {
 	 * 2-step TX timestamps
 	 */
 	struct sk_buff_head skb_txtstamp_queue;
+	struct kthread_worker *xmit_worker;
+	void (*xmit_work_fn)(struct kthread_work *work);
 };
 
 struct sja1105_skb_cb {
@@ -65,9 +73,6 @@ struct sja1105_skb_cb {
 	((struct sja1105_skb_cb *)((skb)->cb))
 
 struct sja1105_port {
-	struct kthread_worker *xmit_worker;
-	struct kthread_work xmit_work;
-	struct sk_buff_head xmit_queue;
 	struct sja1105_tagger_data *data;
 	bool hwts_tx_en;
 };
-- 
cgit v1.2.3


From 6f6770ab1ce2b56619264ec6be0b62f05564dcf6 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 10 Dec 2021 01:34:41 +0200
Subject: net: dsa: sja1105: remove hwts_tx_en from tagger data

This tagger property is in fact not used at all by the tagger, only by
the switch driver. Therefore it makes sense to be moved to
sja1105_private.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dsa/sja1105.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h
index acd9d2afccab..32a8a1344cf6 100644
--- a/include/linux/dsa/sja1105.h
+++ b/include/linux/dsa/sja1105.h
@@ -74,7 +74,6 @@ struct sja1105_skb_cb {
 
 struct sja1105_port {
 	struct sja1105_tagger_data *data;
-	bool hwts_tx_en;
 };
 
 /* Timestamps are in units of 8 ns clock ticks (equivalent to
-- 
cgit v1.2.3


From bfcf1425222008e7390c0784b0f3bb7b497fccaa Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 10 Dec 2021 01:34:42 +0200
Subject: net: dsa: sja1105: make dp->priv point directly to
 sja1105_tagger_data

The design of the sja1105 tagger dp->priv is that each port has a
separate struct sja1105_port, and the sp->data pointer points to a
common struct sja1105_tagger_data.

We have removed all per-port members accessible by the tagger, and now
only struct sja1105_tagger_data remains. Make dp->priv point directly to
this.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dsa/sja1105.h | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h
index 32a8a1344cf6..1dda9cce85d9 100644
--- a/include/linux/dsa/sja1105.h
+++ b/include/linux/dsa/sja1105.h
@@ -43,9 +43,7 @@ struct sja1105_deferred_xmit_work {
 	struct kthread_work work;
 };
 
-/* Global tagger data: each struct sja1105_port has a reference to
- * the structure defined in struct sja1105_private.
- */
+/* Global tagger data */
 struct sja1105_tagger_data {
 	struct sk_buff *stampable_skb;
 	/* Protects concurrent access to the meta state machine
@@ -72,10 +70,6 @@ struct sja1105_skb_cb {
 #define SJA1105_SKB_CB(skb) \
 	((struct sja1105_skb_cb *)((skb)->cb))
 
-struct sja1105_port {
-	struct sja1105_tagger_data *data;
-};
-
 /* Timestamps are in units of 8 ns clock ticks (equivalent to
  * a fixed 125 MHz clock).
  */
-- 
cgit v1.2.3


From 22ee9f8e4011fb8a6d75dc2f9a43360d4f400235 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 10 Dec 2021 01:34:43 +0200
Subject: net: dsa: sja1105: move ts_id from sja1105_tagger_data

The TX timestamp ID is incremented by the SJA1110 PTP timestamping
callback (->port_tx_timestamp) for every packet, when cloning it.
It isn't used by the tagger at all, even though it sits inside the
struct sja1105_tagger_data.

Also, serialization to this structure is currently done through
tagger_data->meta_lock, which is a cheap hack because the meta_lock
isn't used for anything else on SJA1110 (sja1105_rcv_meta_state_machine
isn't called).

This change moves ts_id from sja1105_tagger_data to sja1105_private and
introduces a dedicated spinlock for it, also in sja1105_private.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dsa/sja1105.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h
index 1dda9cce85d9..d8ee53085c09 100644
--- a/include/linux/dsa/sja1105.h
+++ b/include/linux/dsa/sja1105.h
@@ -51,7 +51,6 @@ struct sja1105_tagger_data {
 	 */
 	spinlock_t meta_lock;
 	unsigned long state;
-	u8 ts_id;
 	/* Used on SJA1110 where meta frames are generated only for
 	 * 2-step TX timestamps
 	 */
-- 
cgit v1.2.3


From c79e84866d2ac637fce921a28288f214e91d662b Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 10 Dec 2021 01:34:44 +0200
Subject: net: dsa: tag_sja1105: convert to tagger-owned data

Currently, struct sja1105_tagger_data is a part of struct
sja1105_private, and is used by the sja1105 driver to populate dp->priv.

With the movement towards tagger-owned storage, the sja1105 driver
should not be the owner of this memory.

This change implements the connection between the sja1105 switch driver
and its tagging protocol, which means that sja1105_tagger_data no longer
stays in dp->priv but in ds->tagger_data, and that the sja1105 driver
now only populates the sja1105_port_deferred_xmit callback pointer.
The kthread worker is now the responsibility of the tagger.

The sja1105 driver also alters the tagger's state some more, especially
with regard to the PTP RX timestamping state. This will be fixed up a
bit in further changes.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dsa/sja1105.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h
index d8ee53085c09..9f7d42cbbc08 100644
--- a/include/linux/dsa/sja1105.h
+++ b/include/linux/dsa/sja1105.h
@@ -84,9 +84,12 @@ static inline s64 sja1105_ticks_to_ns(s64 ticks)
 	return ticks * SJA1105_TICK_NS;
 }
 
-static inline bool dsa_port_is_sja1105(struct dsa_port *dp)
+static inline struct sja1105_tagger_data *
+sja1105_tagger_data(struct dsa_switch *ds)
 {
-	return true;
+	BUG_ON(ds->dst->tag_ops->proto != DSA_TAG_PROTO_SJA1105);
+
+	return ds->tagger_data;
 }
 
 #endif /* _NET_DSA_SJA1105_H */
-- 
cgit v1.2.3


From fcbf979a5b4b5784bfb5647ae6190cd5c2ae595d Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 10 Dec 2021 01:34:45 +0200
Subject: Revert "net: dsa: move sja1110_process_meta_tstamp inside the tagging
 protocol driver"

This reverts commit 6d709cadfde68dbd12bef12fcced6222226dcb06.

The above change was done to avoid calling symbols exported by the
switch driver from the tagging protocol driver.

With the tagger-owned storage model, we have a new option on our hands,
and that is for the switch driver to provide a data consumer handler in
the form of a function pointer inside the ->connect_tag_protocol()
method. Having a function pointer avoids the problems of the exported
symbols approach.

By creating a handler for metadata frames holding TX timestamps on
SJA1110, we are able to eliminate an skb queue from the tagger data, and
replace it with a simple, and stateless, function pointer. This skb
queue is now handled exclusively by sja1105_ptp.c, which makes the code
easier to follow, as it used to be before the reverted patch.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dsa/sja1105.h | 26 +++++++-------------------
 1 file changed, 7 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h
index 9f7d42cbbc08..d216211b64f8 100644
--- a/include/linux/dsa/sja1105.h
+++ b/include/linux/dsa/sja1105.h
@@ -37,6 +37,11 @@
 
 #define SJA1105_HWTS_RX_EN			0
 
+enum sja1110_meta_tstamp {
+	SJA1110_META_TSTAMP_TX = 0,
+	SJA1110_META_TSTAMP_RX = 1,
+};
+
 struct sja1105_deferred_xmit_work {
 	struct dsa_port *dp;
 	struct sk_buff *skb;
@@ -51,12 +56,10 @@ struct sja1105_tagger_data {
 	 */
 	spinlock_t meta_lock;
 	unsigned long state;
-	/* Used on SJA1110 where meta frames are generated only for
-	 * 2-step TX timestamps
-	 */
-	struct sk_buff_head skb_txtstamp_queue;
 	struct kthread_worker *xmit_worker;
 	void (*xmit_work_fn)(struct kthread_work *work);
+	void (*meta_tstamp_handler)(struct dsa_switch *ds, int port, u8 ts_id,
+				    enum sja1110_meta_tstamp dir, u64 tstamp);
 };
 
 struct sja1105_skb_cb {
@@ -69,21 +72,6 @@ struct sja1105_skb_cb {
 #define SJA1105_SKB_CB(skb) \
 	((struct sja1105_skb_cb *)((skb)->cb))
 
-/* Timestamps are in units of 8 ns clock ticks (equivalent to
- * a fixed 125 MHz clock).
- */
-#define SJA1105_TICK_NS			8
-
-static inline s64 ns_to_sja1105_ticks(s64 ns)
-{
-	return ns / SJA1105_TICK_NS;
-}
-
-static inline s64 sja1105_ticks_to_ns(s64 ticks)
-{
-	return ticks * SJA1105_TICK_NS;
-}
-
 static inline struct sja1105_tagger_data *
 sja1105_tagger_data(struct dsa_switch *ds)
 {
-- 
cgit v1.2.3


From 950a419d9de13668f86828394cb242a1f9dece74 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 10 Dec 2021 01:34:46 +0200
Subject: net: dsa: tag_sja1105: split sja1105_tagger_data into private and
 public sections

The sja1105 driver messes with the tagging protocol's state when PTP RX
timestamping is enabled/disabled. This is fundamentally necessary
because the tagger needs to know what to do when it receives a PTP
packet. If RX timestamping is enabled, then a metadata follow-up frame
is expected, and this holds the (partial) timestamp. So the tagger plays
hide-and-seek with the network stack until it also gets the metadata
frame, and then presents a single packet, the timestamped PTP packet.
But when RX timestamping isn't enabled, there is no metadata frame
expected, so the hide-and-seek game must be turned off and the packet
must be delivered right away to the network stack.

Considering this, we create a pseudo isolation by devising two tagger
methods callable by the switch: one to get the RX timestamping state,
and one to set it. Since we can't export symbols between the tagger and
the switch driver, these methods are exposed through function pointers.

After this change, the public portion of the sja1105_tagger_data
contains only function pointers.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dsa/sja1105.h | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h
index d216211b64f8..e9cb1ae6d742 100644
--- a/include/linux/dsa/sja1105.h
+++ b/include/linux/dsa/sja1105.h
@@ -35,8 +35,6 @@
 #define SJA1105_META_SMAC			0x222222222222ull
 #define SJA1105_META_DMAC			0x0180C200000Eull
 
-#define SJA1105_HWTS_RX_EN			0
-
 enum sja1110_meta_tstamp {
 	SJA1110_META_TSTAMP_TX = 0,
 	SJA1110_META_TSTAMP_RX = 1,
@@ -50,16 +48,13 @@ struct sja1105_deferred_xmit_work {
 
 /* Global tagger data */
 struct sja1105_tagger_data {
-	struct sk_buff *stampable_skb;
-	/* Protects concurrent access to the meta state machine
-	 * from taggers running on multiple ports on SMP systems
-	 */
-	spinlock_t meta_lock;
-	unsigned long state;
-	struct kthread_worker *xmit_worker;
+	/* Tagger to switch */
 	void (*xmit_work_fn)(struct kthread_work *work);
 	void (*meta_tstamp_handler)(struct dsa_switch *ds, int port, u8 ts_id,
 				    enum sja1110_meta_tstamp dir, u64 tstamp);
+	/* Switch to tagger */
+	bool (*rxtstamp_get_state)(struct dsa_switch *ds);
+	void (*rxtstamp_set_state)(struct dsa_switch *ds, bool on);
 };
 
 struct sja1105_skb_cb {
-- 
cgit v1.2.3


From 9020ef659885f2622cfb386cc229b6d618362895 Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Date: Sun, 17 Oct 2021 18:22:09 +0100
Subject: iio: trigger: Fix a scheduling whilst atomic issue seen on tsc2046

IIO triggers are software IRQ chips that split an incoming IRQ into
separate IRQs routed to all devices using the trigger.
When all consumers are done then a trigger callback reenable() is
called.  There are a few circumstances under which this can happen
in atomic context.

1) A single user of the trigger that calls the iio_trigger_done()
function from interrupt context.
2) A race between disconnecting the last device from a trigger and
the trigger itself sucessfully being disabled.

To avoid a resulting scheduling whilst atomic, close this second corner
by using schedule_work() to ensure the reenable is not done in atomic
context.

Note that drivers must be careful to manage the interaction of
set_state() and reenable() callbacks to ensure appropriate reference
counting if they are relying on the same hardware controls.

Deliberately taking this the slow path rather than via a fixes tree
because the error has hard to hit and I would like it to soak for a while
before hitting a release kernel.

Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Pengutronix Kernel Team <kernel@pengutronix.de>
Cc: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Tested-by: Oleksij Rempel <o.rempel@pengutronix.de>
Cc: <Stable@vger.kernel.org>
Link: https://lore.kernel.org/r/20211017172209.112387-1-jic23@kernel.org
---
 include/linux/iio/trigger.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iio/trigger.h b/include/linux/iio/trigger.h
index 096f68dd2e0c..4c69b144677b 100644
--- a/include/linux/iio/trigger.h
+++ b/include/linux/iio/trigger.h
@@ -55,6 +55,7 @@ struct iio_trigger_ops {
  * @attached_own_device:[INTERN] if we are using our own device as trigger,
  *			i.e. if we registered a poll function to the same
  *			device as the one providing the trigger.
+ * @reenable_work:	[INTERN] work item used to ensure reenable can sleep.
  **/
 struct iio_trigger {
 	const struct iio_trigger_ops	*ops;
@@ -74,6 +75,7 @@ struct iio_trigger {
 	unsigned long pool[BITS_TO_LONGS(CONFIG_IIO_CONSUMERS_PER_TRIGGER)];
 	struct mutex			pool_lock;
 	bool				attached_own_device;
+	struct work_struct		reenable_work;
 };
 
 
-- 
cgit v1.2.3


From 3ac27afefd5dd6a53e830542b899f092a58b6b51 Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Date: Sun, 5 Dec 2021 17:01:29 +0000
Subject: iio:dac:ad5755: Switch to generic firmware properties and drop pdata

Lars pointed out that platform data can also be supported via the
generic properties interface, so there is no point in continuing to
support it separately.  Hence squish the linux/platform_data/ad5755.h
header into the c file and drop accessing the platform data directly.

Done by inspection only.  Mostly completely mechanical with the
exception of a few places where default value handling is
cleaner done by first setting the value, then calling the
firmware reading function but and not checking the return value,
as opposed to reading firmware then setting the default if an error
occurs.

Part of general attempt to move all of IIO over to generic
device properties, both to enable other firmware types and
to remove drivers that can be the source of of_ specific
behaviour in new drivers.

Suggested-by: Lars-Peter Clausen <lars@metafoo.de>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
---
 include/linux/platform_data/ad5755.h | 102 -----------------------------------
 1 file changed, 102 deletions(-)
 delete mode 100644 include/linux/platform_data/ad5755.h

(limited to 'include/linux')

diff --git a/include/linux/platform_data/ad5755.h b/include/linux/platform_data/ad5755.h
deleted file mode 100644
index e371e08f04bc..000000000000
--- a/include/linux/platform_data/ad5755.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright 2012 Analog Devices Inc.
- */
-#ifndef __LINUX_PLATFORM_DATA_AD5755_H__
-#define __LINUX_PLATFORM_DATA_AD5755_H__
-
-enum ad5755_mode {
-	AD5755_MODE_VOLTAGE_0V_5V		= 0,
-	AD5755_MODE_VOLTAGE_0V_10V		= 1,
-	AD5755_MODE_VOLTAGE_PLUSMINUS_5V	= 2,
-	AD5755_MODE_VOLTAGE_PLUSMINUS_10V	= 3,
-	AD5755_MODE_CURRENT_4mA_20mA		= 4,
-	AD5755_MODE_CURRENT_0mA_20mA		= 5,
-	AD5755_MODE_CURRENT_0mA_24mA		= 6,
-};
-
-enum ad5755_dc_dc_phase {
-	AD5755_DC_DC_PHASE_ALL_SAME_EDGE		= 0,
-	AD5755_DC_DC_PHASE_A_B_SAME_EDGE_C_D_OPP_EDGE	= 1,
-	AD5755_DC_DC_PHASE_A_C_SAME_EDGE_B_D_OPP_EDGE	= 2,
-	AD5755_DC_DC_PHASE_90_DEGREE			= 3,
-};
-
-enum ad5755_dc_dc_freq {
-	AD5755_DC_DC_FREQ_250kHZ = 0,
-	AD5755_DC_DC_FREQ_410kHZ = 1,
-	AD5755_DC_DC_FREQ_650kHZ = 2,
-};
-
-enum ad5755_dc_dc_maxv {
-	AD5755_DC_DC_MAXV_23V	= 0,
-	AD5755_DC_DC_MAXV_24V5	= 1,
-	AD5755_DC_DC_MAXV_27V	= 2,
-	AD5755_DC_DC_MAXV_29V5	= 3,
-};
-
-enum ad5755_slew_rate {
-	AD5755_SLEW_RATE_64k	= 0,
-	AD5755_SLEW_RATE_32k	= 1,
-	AD5755_SLEW_RATE_16k	= 2,
-	AD5755_SLEW_RATE_8k	= 3,
-	AD5755_SLEW_RATE_4k	= 4,
-	AD5755_SLEW_RATE_2k	= 5,
-	AD5755_SLEW_RATE_1k	= 6,
-	AD5755_SLEW_RATE_500	= 7,
-	AD5755_SLEW_RATE_250	= 8,
-	AD5755_SLEW_RATE_125	= 9,
-	AD5755_SLEW_RATE_64	= 10,
-	AD5755_SLEW_RATE_32	= 11,
-	AD5755_SLEW_RATE_16	= 12,
-	AD5755_SLEW_RATE_8	= 13,
-	AD5755_SLEW_RATE_4	= 14,
-	AD5755_SLEW_RATE_0_5	= 15,
-};
-
-enum ad5755_slew_step_size {
-	AD5755_SLEW_STEP_SIZE_1 = 0,
-	AD5755_SLEW_STEP_SIZE_2 = 1,
-	AD5755_SLEW_STEP_SIZE_4 = 2,
-	AD5755_SLEW_STEP_SIZE_8 = 3,
-	AD5755_SLEW_STEP_SIZE_16 = 4,
-	AD5755_SLEW_STEP_SIZE_32 = 5,
-	AD5755_SLEW_STEP_SIZE_64 = 6,
-	AD5755_SLEW_STEP_SIZE_128 = 7,
-	AD5755_SLEW_STEP_SIZE_256 = 8,
-};
-
-/**
- * struct ad5755_platform_data - AD5755 DAC driver platform data
- * @ext_dc_dc_compenstation_resistor: Whether an external DC-DC converter
- * compensation register is used.
- * @dc_dc_phase: DC-DC converter phase.
- * @dc_dc_freq: DC-DC converter frequency.
- * @dc_dc_maxv: DC-DC maximum allowed boost voltage.
- * @dac.mode: The mode to be used for the DAC output.
- * @dac.ext_current_sense_resistor: Whether an external current sense resistor
- * is used.
- * @dac.enable_voltage_overrange: Whether to enable 20% voltage output overrange.
- * @dac.slew.enable: Whether to enable digital slew.
- * @dac.slew.rate: Slew rate of the digital slew.
- * @dac.slew.step_size: Slew step size of the digital slew.
- **/
-struct ad5755_platform_data {
-	bool ext_dc_dc_compenstation_resistor;
-	enum ad5755_dc_dc_phase dc_dc_phase;
-	enum ad5755_dc_dc_freq dc_dc_freq;
-	enum ad5755_dc_dc_maxv dc_dc_maxv;
-
-	struct {
-		enum ad5755_mode mode;
-		bool ext_current_sense_resistor;
-		bool enable_voltage_overrange;
-		struct {
-			bool enable;
-			enum ad5755_slew_rate rate;
-			enum ad5755_slew_step_size step_size;
-		} slew;
-	} dac[4];
-};
-
-#endif
-- 
cgit v1.2.3


From c537be0bfad6337f2afd618fe252c03217191405 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Fri, 3 Dec 2021 11:28:46 +0100
Subject: i2c: acpi: Add i2c_acpi_new_device_by_fwnode() function

Change i2c_acpi_new_device() into i2c_acpi_new_device_by_fwnode() and
add a static inline wrapper providing the old i2c_acpi_new_device()
behavior.

This is necessary because in some cases we may only have access
to the fwnode / acpi_device and not to the matching physical-node
struct device *.

Suggested-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Acked-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Acked-by: Wolfram Sang <wsa@kernel.org>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Link: https://lore.kernel.org/r/20211203102857.44539-4-hdegoede@redhat.com
---
 include/linux/i2c.h | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 16119ac1aa97..7d4f52ceb7b5 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -1025,8 +1025,9 @@ bool i2c_acpi_get_i2c_resource(struct acpi_resource *ares,
 			       struct acpi_resource_i2c_serialbus **i2c);
 int i2c_acpi_client_count(struct acpi_device *adev);
 u32 i2c_acpi_find_bus_speed(struct device *dev);
-struct i2c_client *i2c_acpi_new_device(struct device *dev, int index,
-				       struct i2c_board_info *info);
+struct i2c_client *i2c_acpi_new_device_by_fwnode(struct fwnode_handle *fwnode,
+						 int index,
+						 struct i2c_board_info *info);
 struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle);
 bool i2c_acpi_waive_d0_probe(struct device *dev);
 #else
@@ -1043,8 +1044,9 @@ static inline u32 i2c_acpi_find_bus_speed(struct device *dev)
 {
 	return 0;
 }
-static inline struct i2c_client *i2c_acpi_new_device(struct device *dev,
-					int index, struct i2c_board_info *info)
+static inline struct i2c_client *i2c_acpi_new_device_by_fwnode(
+					struct fwnode_handle *fwnode, int index,
+					struct i2c_board_info *info)
 {
 	return ERR_PTR(-ENODEV);
 }
@@ -1058,4 +1060,11 @@ static inline bool i2c_acpi_waive_d0_probe(struct device *dev)
 }
 #endif /* CONFIG_ACPI */
 
+static inline struct i2c_client *i2c_acpi_new_device(struct device *dev,
+						     int index,
+						     struct i2c_board_info *info)
+{
+	return i2c_acpi_new_device_by_fwnode(dev_fwnode(dev), index, info);
+}
+
 #endif /* _LINUX_I2C_H */
-- 
cgit v1.2.3


From 9dfa374cc6d04d2515adc21c39e356b64ee45a29 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Fri, 3 Dec 2021 11:28:47 +0100
Subject: platform_data: Add linux/platform_data/tps68470.h file

The clk and regulator frameworks expect clk/regulator consumer-devices
to have info about the consumed clks/regulators described in the device's
fw_node.

To work around cases where this info is not present in the firmware tables,
which is often the case on x86/ACPI devices, both frameworks allow the
provider-driver to attach info about consumers to the provider-device
during probe/registration of the provider device.

The TI TPS68470 PMIC is used x86/ACPI devices with the consumer-info
missing from the ACPI tables. Thus the tps68470-clk and tps68470-regulator
drivers must provide the consumer-info at probe time.

Define tps68470_clk_platform_data and tps68470_regulator_platform_data
structs to allow the x86 platform code to pass the necessary consumer info
to these drivers.

Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Link: https://lore.kernel.org/r/20211203102857.44539-5-hdegoede@redhat.com
---
 include/linux/platform_data/tps68470.h | 35 ++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100644 include/linux/platform_data/tps68470.h

(limited to 'include/linux')

diff --git a/include/linux/platform_data/tps68470.h b/include/linux/platform_data/tps68470.h
new file mode 100644
index 000000000000..126d082c3f2e
--- /dev/null
+++ b/include/linux/platform_data/tps68470.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * TI TPS68470 PMIC platform data definition.
+ *
+ * Copyright (c) 2021 Red Hat Inc.
+ *
+ * Red Hat authors:
+ * Hans de Goede <hdegoede@redhat.com>
+ */
+#ifndef __PDATA_TPS68470_H
+#define __PDATA_TPS68470_H
+
+enum tps68470_regulators {
+	TPS68470_CORE,
+	TPS68470_ANA,
+	TPS68470_VCM,
+	TPS68470_VIO,
+	TPS68470_VSIO,
+	TPS68470_AUX1,
+	TPS68470_AUX2,
+	TPS68470_NUM_REGULATORS
+};
+
+struct regulator_init_data;
+
+struct tps68470_regulator_platform_data {
+	const struct regulator_init_data *reg_init_data[TPS68470_NUM_REGULATORS];
+};
+
+struct tps68470_clk_platform_data {
+	const char *consumer_dev_name;
+	const char *consumer_con_id;
+};
+
+#endif
-- 
cgit v1.2.3


From 3c118547f87e930d45a5787e386734015dd93b32 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 10 Dec 2021 21:29:59 +0100
Subject: u64_stats: Disable preemption on 32bit UP+SMP PREEMPT_RT during
 updates.

On PREEMPT_RT the seqcount_t for synchronisation is required on 32bit
architectures even on UP because the softirq (and the threaded IRQ handler) can
be preempted.

With the seqcount_t for synchronisation, a reader with higher priority can
preempt the writer and then spin endlessly in read_seqcount_begin() while the
writer can't make progress.

To avoid such a lock up on PREEMPT_RT the writer must disable preemption during
the update. There is no need to disable interrupts because no writer is using
this API in hard-IRQ context on PREEMPT_RT.

Disable preemption on 32bit-RT within the u64_stats write section.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/u64_stats_sync.h | 42 ++++++++++++++++++++++++++++--------------
 1 file changed, 28 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h
index e8ec116c916b..6ad4e9032d53 100644
--- a/include/linux/u64_stats_sync.h
+++ b/include/linux/u64_stats_sync.h
@@ -66,7 +66,7 @@
 #include <linux/seqlock.h>
 
 struct u64_stats_sync {
-#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT))
 	seqcount_t	seq;
 #endif
 };
@@ -125,7 +125,7 @@ static inline void u64_stats_inc(u64_stats_t *p)
 }
 #endif
 
-#if BITS_PER_LONG == 32 && defined(CONFIG_SMP)
+#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT))
 #define u64_stats_init(syncp)	seqcount_init(&(syncp)->seq)
 #else
 static inline void u64_stats_init(struct u64_stats_sync *syncp)
@@ -135,15 +135,19 @@ static inline void u64_stats_init(struct u64_stats_sync *syncp)
 
 static inline void u64_stats_update_begin(struct u64_stats_sync *syncp)
 {
-#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT))
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_disable();
 	write_seqcount_begin(&syncp->seq);
 #endif
 }
 
 static inline void u64_stats_update_end(struct u64_stats_sync *syncp)
 {
-#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT))
 	write_seqcount_end(&syncp->seq);
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_enable();
 #endif
 }
 
@@ -152,8 +156,11 @@ u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp)
 {
 	unsigned long flags = 0;
 
-#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
-	local_irq_save(flags);
+#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT))
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_disable();
+	else
+		local_irq_save(flags);
 	write_seqcount_begin(&syncp->seq);
 #endif
 	return flags;
@@ -163,15 +170,18 @@ static inline void
 u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp,
 				unsigned long flags)
 {
-#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT))
 	write_seqcount_end(&syncp->seq);
-	local_irq_restore(flags);
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_enable();
+	else
+		local_irq_restore(flags);
 #endif
 }
 
 static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
 {
-#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT))
 	return read_seqcount_begin(&syncp->seq);
 #else
 	return 0;
@@ -180,7 +190,7 @@ static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *
 
 static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
 {
-#if BITS_PER_LONG==32 && !defined(CONFIG_SMP)
+#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT))
 	preempt_disable();
 #endif
 	return __u64_stats_fetch_begin(syncp);
@@ -189,7 +199,7 @@ static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *sy
 static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
 					 unsigned int start)
 {
-#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT))
 	return read_seqcount_retry(&syncp->seq, start);
 #else
 	return false;
@@ -199,7 +209,7 @@ static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
 static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
 					 unsigned int start)
 {
-#if BITS_PER_LONG==32 && !defined(CONFIG_SMP)
+#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT))
 	preempt_enable();
 #endif
 	return __u64_stats_fetch_retry(syncp, start);
@@ -213,7 +223,9 @@ static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
  */
 static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp)
 {
-#if BITS_PER_LONG==32 && !defined(CONFIG_SMP)
+#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT)
+	preempt_disable();
+#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP)
 	local_irq_disable();
 #endif
 	return __u64_stats_fetch_begin(syncp);
@@ -222,7 +234,9 @@ static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync
 static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp,
 					     unsigned int start)
 {
-#if BITS_PER_LONG==32 && !defined(CONFIG_SMP)
+#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT)
+	preempt_enable();
+#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP)
 	local_irq_enable();
 #endif
 	return __u64_stats_fetch_retry(syncp, start);
-- 
cgit v1.2.3


From 4bc5e64e6cf37007e436970024e5998ee0935651 Mon Sep 17 00:00:00 2001
From: Javier Martinez Canillas <javierm@redhat.com>
Date: Fri, 26 Nov 2021 01:13:32 +0100
Subject: efi: Move efifb_setup_from_dmi() prototype from arch headers

Commit 8633ef82f101 ("drivers/firmware: consolidate EFI framebuffer setup
for all arches") made the Generic System Framebuffers (sysfb) driver able
to be built on non-x86 architectures.

But it left the efifb_setup_from_dmi() function prototype declaration in
the architecture specific headers. This could lead to the following
compiler warning as reported by the kernel test robot:

   drivers/firmware/efi/sysfb_efi.c:70:6: warning: no previous prototype for function 'efifb_setup_from_dmi' [-Wmissing-prototypes]
   void efifb_setup_from_dmi(struct screen_info *si, const char *opt)
        ^
   drivers/firmware/efi/sysfb_efi.c:70:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   void efifb_setup_from_dmi(struct screen_info *si, const char *opt)

Fixes: 8633ef82f101 ("drivers/firmware: consolidate EFI framebuffer setup for all arches")
Reported-by: kernel test robot <lkp@intel.com>
Cc: <stable@vger.kernel.org> # 5.15.x
Signed-off-by: Javier Martinez Canillas <javierm@redhat.com>
Acked-by: Thomas Zimmermann <tzimmermann@suse.de>
Link: https://lore.kernel.org/r/20211126001333.555514-1-javierm@redhat.com
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 include/linux/efi.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/efi.h b/include/linux/efi.h
index dbd39b20e034..ef8dbc0a1522 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -1283,4 +1283,10 @@ static inline struct efi_mokvar_table_entry *efi_mokvar_entry_find(
 }
 #endif
 
+#ifdef CONFIG_SYSFB
+extern void efifb_setup_from_dmi(struct screen_info *si, const char *opt);
+#else
+static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt) { }
+#endif
+
 #endif /* _LINUX_EFI_H */
-- 
cgit v1.2.3


From f92c1e183604c20ce00eb889315fdaa8f2d9e509 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Wed, 8 Dec 2021 20:32:44 +0100
Subject: bpf: Add get_func_[arg|ret|arg_cnt] helpers

Adding following helpers for tracing programs:

Get n-th argument of the traced function:
  long bpf_get_func_arg(void *ctx, u32 n, u64 *value)

Get return value of the traced function:
  long bpf_get_func_ret(void *ctx, u64 *value)

Get arguments count of the traced function:
  long bpf_get_func_arg_cnt(void *ctx)

The trampoline now stores number of arguments on ctx-8
address, so it's easy to verify argument index and find
return value argument's position.

Moving function ip address on the trampoline stack behind
the number of functions arguments, so it's now stored on
ctx-16 address if it's needed.

All helpers above are inlined by verifier.

Also bit unrelated small change - using newly added function
bpf_prog_has_trampoline in check_get_func_ip.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211208193245.172141-5-jolsa@kernel.org
---
 include/linux/bpf.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7a40022e3d00..965fffaf0308 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -777,6 +777,7 @@ void bpf_ksym_add(struct bpf_ksym *ksym);
 void bpf_ksym_del(struct bpf_ksym *ksym);
 int bpf_jit_charge_modmem(u32 pages);
 void bpf_jit_uncharge_modmem(u32 pages);
+bool bpf_prog_has_trampoline(const struct bpf_prog *prog);
 #else
 static inline int bpf_trampoline_link_prog(struct bpf_prog *prog,
 					   struct bpf_trampoline *tr)
@@ -805,6 +806,10 @@ static inline bool is_bpf_image_address(unsigned long address)
 {
 	return false;
 }
+static inline bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
+{
+	return false;
+}
 #endif
 
 struct bpf_func_info_aux {
-- 
cgit v1.2.3


From 0e25498f8cd43c1b5aa327f373dd094e9a006da7 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 28 Jun 2021 14:52:01 -0500
Subject: exit: Add and use make_task_dead.

There are two big uses of do_exit.  The first is it's design use to be
the guts of the exit(2) system call.  The second use is to terminate
a task after something catastrophic has happened like a NULL pointer
in kernel code.

Add a function make_task_dead that is initialy exactly the same as
do_exit to cover the cases where do_exit is called to handle
catastrophic failure.  In time this can probably be reduced to just a
light wrapper around do_task_dead. For now keep it exactly the same so
that there will be no behavioral differences introducing this new
concept.

Replace all of the uses of do_exit that use it for catastraphic
task cleanup with make_task_dead to make it clear what the code
is doing.

As part of this rename rewind_stack_do_exit
rewind_stack_and_make_dead.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/sched/task.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index ba88a6987400..2d4bbd9c3278 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -59,6 +59,7 @@ extern void sched_post_fork(struct task_struct *p,
 extern void sched_dead(struct task_struct *p);
 
 void __noreturn do_task_dead(void);
+void __noreturn make_task_dead(int signr);
 
 extern void proc_caches_init(void);
 
-- 
cgit v1.2.3


From bbda86e988d4c124e4cfa816291cbd583ae8bfb1 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 22 Nov 2021 10:27:36 -0600
Subject: exit: Implement kthread_exit

The way the per task_struct exit_code is used by kernel threads is not
quite compatible how it is used by userspace applications.  The low
byte of the userspace exit_code value encodes the exit signal.  While
kthreads just use the value as an int holding ordinary kernel function
exit status like -EPERM.

Add kthread_exit to clearly separate the two kinds of uses.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/kthread.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 346b0f269161..22c43d419687 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -70,6 +70,7 @@ void *kthread_probe_data(struct task_struct *k);
 int kthread_park(struct task_struct *k);
 void kthread_unpark(struct task_struct *k);
 void kthread_parkme(void);
+void kthread_exit(long result) __noreturn;
 
 int kthreadd(void *unused);
 extern struct task_struct *kthreadd_task;
-- 
cgit v1.2.3


From ca3574bd653aba234a4b31955f2778947403be16 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 3 Dec 2021 11:00:19 -0600
Subject: exit: Rename module_put_and_exit to module_put_and_kthread_exit

Update module_put_and_exit to call kthread_exit instead of do_exit.

Change the name to reflect this change in functionality.  All of the
users of module_put_and_exit are causing the current kthread to exit
so this change makes it clear what is happening.  There is no
functional change.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/module.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index c9f1200b2312..f03be97e9ec1 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -595,9 +595,9 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 /* Look for this name: can be of form module:name. */
 unsigned long module_kallsyms_lookup_name(const char *name);
 
-extern void __noreturn __module_put_and_exit(struct module *mod,
+extern void __noreturn __module_put_and_kthread_exit(struct module *mod,
 			long code);
-#define module_put_and_exit(code) __module_put_and_exit(THIS_MODULE, code)
+#define module_put_and_kthread_exit(code) __module_put_and_kthread_exit(THIS_MODULE, code)
 
 #ifdef CONFIG_MODULE_UNLOAD
 int module_refcount(struct module *mod);
@@ -790,7 +790,7 @@ static inline int unregister_module_notifier(struct notifier_block *nb)
 	return 0;
 }
 
-#define module_put_and_exit(code) do_exit(code)
+#define module_put_and_kthread_exit(code) kthread_exit(code)
 
 static inline void print_modules(void)
 {
-- 
cgit v1.2.3


From cead18552660702a4a46f58e65188fe5f36e9dfe Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 22 Nov 2021 11:15:19 -0600
Subject: exit: Rename complete_and_exit to kthread_complete_and_exit

Update complete_and_exit to call kthread_exit instead of do_exit.

Change the name to reflect this change in functionality.  All of the
users of complete_and_exit are causing the current kthread to exit so
this change makes it clear what is happening.

Move the implementation of kthread_complete_and_exit from
kernel/exit.c to to kernel/kthread.c.  As this function is kthread
specific it makes most sense to live with the kthread functions.

There are no functional change.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/kernel.h  | 1 -
 include/linux/kthread.h | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 77755ac3e189..055eb203c00e 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -187,7 +187,6 @@ static inline void might_fault(void) { }
 #endif
 
 void do_exit(long error_code) __noreturn;
-void complete_and_exit(struct completion *, long) __noreturn;
 
 extern int num_to_str(char *buf, int size,
 		      unsigned long long num, unsigned int width);
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 22c43d419687..d86a7e3b9a52 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -71,6 +71,7 @@ int kthread_park(struct task_struct *k);
 void kthread_unpark(struct task_struct *k);
 void kthread_parkme(void);
 void kthread_exit(long result) __noreturn;
+void kthread_complete_and_exit(struct completion *, long) __noreturn;
 
 int kthreadd(void *unused);
 extern struct task_struct *kthreadd_task;
-- 
cgit v1.2.3


From 40966e316f86b8cfd83abd31ccb4df729309d3e7 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 2 Dec 2021 09:56:14 -0600
Subject: kthread: Ensure struct kthread is present for all kthreads

Today the rules are a bit iffy and arbitrary about which kernel
threads have struct kthread present.  Both idle threads and thread
started with create_kthread want struct kthread present so that is
effectively all kernel threads.  Make the rule that if PF_KTHREAD
and the task is running then struct kthread is present.

This will allow the kernel thread code to using tsk->exit_code
with different semantics from ordinary processes.

To make ensure that struct kthread is present for all
kernel threads move it's allocation into copy_process.

Add a deallocation of struct kthread in exec for processes
that were kernel threads.

Move the allocation of struct kthread for the initial thread
earlier so that it is not repeated for each additional idle
thread.

Move the initialization of struct kthread into set_kthread_struct
so that the structure is always and reliably initailized.

Clear set_child_tid in free_kthread_struct to ensure the kthread
struct is reliably freed during exec.  The function
free_kthread_struct does not need to clear vfork_done during exec as
exec_mm_release called from exec_mmap has already cleared vfork_done.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/kthread.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index d86a7e3b9a52..4f3433afb54b 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -33,7 +33,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
 					  unsigned int cpu,
 					  const char *namefmt);
 
-void set_kthread_struct(struct task_struct *p);
+bool set_kthread_struct(struct task_struct *p);
 
 void kthread_set_per_cpu(struct task_struct *k, int cpu);
 bool kthread_is_per_cpu(struct task_struct *k);
-- 
cgit v1.2.3


From df5e49c880ea0776806b8a9f8ab95e035272cf6f Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 29 Nov 2021 15:51:25 +1100
Subject: SUNRPC: change svc_get() to return the svc.

It is common for 'get' functions to return the object that was 'got',
and there are a couple of places where users of svc_get() would be a
little simpler if svc_get() did that.

Make it so.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 0ae28ae6caf2..5d9568953fcd 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -120,9 +120,10 @@ struct svc_serv {
  * change the number of threads.  Horrible, but there it is.
  * Should be called with the "service mutex" held.
  */
-static inline void svc_get(struct svc_serv *serv)
+static inline struct svc_serv *svc_get(struct svc_serv *serv)
 {
 	serv->sv_nrthreads++;
+	return serv;
 }
 
 /*
-- 
cgit v1.2.3


From 8c62d12740a1450d2e8456d5747f440e10db281a Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 29 Nov 2021 15:51:25 +1100
Subject: SUNRPC/NFSD: clean up get/put functions.

svc_destroy() is poorly named - it doesn't necessarily destroy the svc,
it might just reduce the ref count.
nfsd_destroy() is poorly named for the same reason.

This patch:
 - removes the refcount functionality from svc_destroy(), moving it to
   a new svc_put().  Almost all previous callers of svc_destroy() now
   call svc_put().
 - renames nfsd_destroy() to nfsd_put() and improves the code, using
   the new svc_destroy() rather than svc_put()
 - removes a few comments that explain the important for balanced
   get/put calls.  This should be obvious.

The only non-trivial part of this is that svc_destroy() would call
svc_sock_update() on a non-final decrement.  It can no longer do that,
and svc_put() isn't really a good place of it.  This call is now made
from svc_exit_thread() which seems like a good place.  This makes the
call *before* sv_nrthreads is decremented rather than after.  This
is not particularly important as the call just sets a flag which
causes sv_nrthreads set be checked later.  A subsequent patch will
improve the ordering.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 5d9568953fcd..73d56d33a36d 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -114,8 +114,13 @@ struct svc_serv {
 #endif /* CONFIG_SUNRPC_BACKCHANNEL */
 };
 
-/*
- * We use sv_nrthreads as a reference count.  svc_destroy() drops
+/**
+ * svc_get() - increment reference count on a SUNRPC serv
+ * @serv:  the svc_serv to have count incremented
+ *
+ * Returns: the svc_serv that was passed in.
+ *
+ * We use sv_nrthreads as a reference count.  svc_put() drops
  * this refcount, so we need to bump it up around operations that
  * change the number of threads.  Horrible, but there it is.
  * Should be called with the "service mutex" held.
@@ -126,6 +131,22 @@ static inline struct svc_serv *svc_get(struct svc_serv *serv)
 	return serv;
 }
 
+void svc_destroy(struct svc_serv *serv);
+
+/**
+ * svc_put - decrement reference count on a SUNRPC serv
+ * @serv:  the svc_serv to have count decremented
+ *
+ * When the reference count reaches zero, svc_destroy()
+ * is called to clean up and free the serv.
+ */
+static inline void svc_put(struct svc_serv *serv)
+{
+	serv->sv_nrthreads -= 1;
+	if (serv->sv_nrthreads == 0)
+		svc_destroy(serv);
+}
+
 /*
  * Maximum payload size supported by a kernel RPC server.
  * This is use to determine the max number of pages nfsd is
@@ -515,7 +536,6 @@ struct svc_serv *  svc_create_pooled(struct svc_program *, unsigned int,
 int		   svc_set_num_threads(struct svc_serv *, struct svc_pool *, int);
 int		   svc_set_num_threads_sync(struct svc_serv *, struct svc_pool *, int);
 int		   svc_pool_stats_open(struct svc_serv *serv, struct file *file);
-void		   svc_destroy(struct svc_serv *);
 void		   svc_shutdown_net(struct svc_serv *, struct net *);
 int		   svc_process(struct svc_rqst *);
 int		   bc_svc_process(struct svc_serv *, struct rpc_rqst *,
-- 
cgit v1.2.3


From ec52361df99b490f6af412b046df9799b92c1050 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 29 Nov 2021 15:51:25 +1100
Subject: SUNRPC: stop using ->sv_nrthreads as a refcount

The use of sv_nrthreads as a general refcount results in clumsy code, as
is seen by various comments needed to explain the situation.

This patch introduces a 'struct kref' and uses that for reference
counting, leaving sv_nrthreads to be a pure count of threads.  The kref
is managed particularly in svc_get() and svc_put(), and also nfsd_put();

svc_destroy() now takes a pointer to the embedded kref, rather than to
the serv.

nfsd allows the svc_serv to exist with ->sv_nrhtreads being zero.  This
happens when a transport is created before the first thread is started.
To support this, a 'keep_active' flag is introduced which holds a ref on
the svc_serv.  This is set when any listening socket is successfully
added (unless there are running threads), and cleared when the number of
threads is set.  So when the last thread exits, the nfs_serv will be
destroyed.
The use of 'keep_active' replaces previous code which checked if there
were any permanent sockets.

We no longer clear ->rq_server when nfsd() exits.  This was done
to prevent svc_exit_thread() from calling svc_destroy().
Instead we take an extra reference to the svc_serv to prevent
svc_destroy() from being called.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 73d56d33a36d..3903b4ae8ac5 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -85,6 +85,7 @@ struct svc_serv {
 	struct svc_program *	sv_program;	/* RPC program */
 	struct svc_stat *	sv_stats;	/* RPC statistics */
 	spinlock_t		sv_lock;
+	struct kref		sv_refcnt;
 	unsigned int		sv_nrthreads;	/* # of server threads */
 	unsigned int		sv_maxconn;	/* max connections allowed or
 						 * '0' causing max to be based
@@ -119,19 +120,14 @@ struct svc_serv {
  * @serv:  the svc_serv to have count incremented
  *
  * Returns: the svc_serv that was passed in.
- *
- * We use sv_nrthreads as a reference count.  svc_put() drops
- * this refcount, so we need to bump it up around operations that
- * change the number of threads.  Horrible, but there it is.
- * Should be called with the "service mutex" held.
  */
 static inline struct svc_serv *svc_get(struct svc_serv *serv)
 {
-	serv->sv_nrthreads++;
+	kref_get(&serv->sv_refcnt);
 	return serv;
 }
 
-void svc_destroy(struct svc_serv *serv);
+void svc_destroy(struct kref *);
 
 /**
  * svc_put - decrement reference count on a SUNRPC serv
@@ -142,9 +138,7 @@ void svc_destroy(struct svc_serv *serv);
  */
 static inline void svc_put(struct svc_serv *serv)
 {
-	serv->sv_nrthreads -= 1;
-	if (serv->sv_nrthreads == 0)
-		svc_destroy(serv);
+	kref_put(&serv->sv_refcnt, svc_destroy);
 }
 
 /*
-- 
cgit v1.2.3


From 3409e4f1e8f239f0ed81be0b068ecf4e73e2e826 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 29 Nov 2021 15:51:25 +1100
Subject: NFSD: Make it possible to use svc_set_num_threads_sync

nfsd cannot currently use svc_set_num_threads_sync.  It instead
uses svc_set_num_threads which does *not* wait for threads to all
exit, and has a separate mechanism (nfsd_shutdown_complete) to wait
for completion.

The reason that nfsd is unlike other services is that nfsd threads can
exit separately from svc_set_num_threads being called - they die on
receipt of SIGKILL.  Also, when the last thread exits, the service must
be shut down (sockets closed).

For this, the nfsd_mutex needs to be taken, and as that mutex needs to
be held while svc_set_num_threads is called, the one cannot wait for
the other.

This patch changes the nfsd thread so that it can drop the ref on the
service without blocking on nfsd_mutex, so that svc_set_num_threads_sync
can be used:
 - if it can drop a non-last reference, it does that.  This does not
   trigger shutdown and does not require a mutex.  This will likely
   happen for all but the last thread signalled, and for all threads
   being shut down by nfsd_shutdown_threads()
 - if it can get the mutex without blocking (trylock), it does that
   and then drops the reference.  This will likely happen for the
   last thread killed by SIGKILL
 - Otherwise there might be an unrelated task holding the mutex,
   possibly in another network namespace, or nfsd_shutdown_threads()
   might be just about to get a reference on the service, after which
   we can drop ours safely.
   We cannot conveniently get wakeup notifications on these events,
   and we are unlikely to need to, so we sleep briefly and check again.

With this we can discard nfsd_shutdown_complete and
nfsd_complete_shutdown(), and switch to svc_set_num_threads_sync.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 3903b4ae8ac5..36bfc0281988 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -141,6 +141,19 @@ static inline void svc_put(struct svc_serv *serv)
 	kref_put(&serv->sv_refcnt, svc_destroy);
 }
 
+/**
+ * svc_put_not_last - decrement non-final reference count on SUNRPC serv
+ * @serv:  the svc_serv to have count decremented
+ *
+ * Returns: %true is refcount was decremented.
+ *
+ * If the refcount is 1, it is not decremented and instead failure is reported.
+ */
+static inline bool svc_put_not_last(struct svc_serv *serv)
+{
+	return refcount_dec_not_one(&serv->sv_refcnt.refcount);
+}
+
 /*
  * Maximum payload size supported by a kernel RPC server.
  * This is use to determine the max number of pages nfsd is
-- 
cgit v1.2.3


From 3ebdbe5203a874614819700d3f470724cb803709 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 29 Nov 2021 15:51:25 +1100
Subject: SUNRPC: discard svo_setup and rename svc_set_num_threads_sync()

The ->svo_setup callback serves no purpose.  It is always called from
within the same module that chooses which callback is needed.  So
discard it and call the relevant function directly.

Now that svc_set_num_threads() is no longer used remove it and rename
svc_set_num_threads_sync() to remove the "_sync" suffix.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 36bfc0281988..0b38c6eaf985 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -64,9 +64,6 @@ struct svc_serv_ops {
 	/* queue up a transport for servicing */
 	void		(*svo_enqueue_xprt)(struct svc_xprt *);
 
-	/* set up thread (or whatever) execution context */
-	int		(*svo_setup)(struct svc_serv *, struct svc_pool *, int);
-
 	/* optional module to count when adding threads (pooled svcs only) */
 	struct module	*svo_module;
 };
@@ -541,7 +538,6 @@ void		   svc_pool_map_put(void);
 struct svc_serv *  svc_create_pooled(struct svc_program *, unsigned int,
 			const struct svc_serv_ops *);
 int		   svc_set_num_threads(struct svc_serv *, struct svc_pool *, int);
-int		   svc_set_num_threads_sync(struct svc_serv *, struct svc_pool *, int);
 int		   svc_pool_stats_open(struct svc_serv *serv, struct file *file);
 void		   svc_shutdown_net(struct svc_serv *, struct net *);
 int		   svc_process(struct svc_rqst *);
-- 
cgit v1.2.3


From cf0e124e0a489944d08fcc3c694d2b234d2cc658 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 29 Nov 2021 15:51:25 +1100
Subject: SUNRPC: move the pool_map definitions (back) into svc.c

These definitions are not used outside of svc.c, and there is no
evidence that they ever have been.  So move them into svc.c
and make the declarations 'static'.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h | 25 -------------------------
 1 file changed, 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 0b38c6eaf985..d69e6108cb83 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -494,29 +494,6 @@ struct svc_procedure {
 	const char *		pc_name;	/* for display */
 };
 
-/*
- * Mode for mapping cpus to pools.
- */
-enum {
-	SVC_POOL_AUTO = -1,	/* choose one of the others */
-	SVC_POOL_GLOBAL,	/* no mapping, just a single global pool
-				 * (legacy & UP mode) */
-	SVC_POOL_PERCPU,	/* one pool per cpu */
-	SVC_POOL_PERNODE	/* one pool per numa node */
-};
-
-struct svc_pool_map {
-	int count;			/* How many svc_servs use us */
-	int mode;			/* Note: int not enum to avoid
-					 * warnings about "enumeration value
-					 * not handled in switch" */
-	unsigned int npools;
-	unsigned int *pool_to;		/* maps pool id to cpu or node */
-	unsigned int *to_pool;		/* maps cpu or node to pool id */
-};
-
-extern struct svc_pool_map svc_pool_map;
-
 /*
  * Function prototypes.
  */
@@ -533,8 +510,6 @@ void		   svc_rqst_replace_page(struct svc_rqst *rqstp,
 					 struct page *page);
 void		   svc_rqst_free(struct svc_rqst *);
 void		   svc_exit_thread(struct svc_rqst *);
-unsigned int	   svc_pool_map_get(void);
-void		   svc_pool_map_put(void);
 struct svc_serv *  svc_create_pooled(struct svc_program *, unsigned int,
 			const struct svc_serv_ops *);
 int		   svc_set_num_threads(struct svc_serv *, struct svc_pool *, int);
-- 
cgit v1.2.3


From 6b044fbaab02292fedb17565dbb3f2528083b169 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 29 Nov 2021 15:51:25 +1100
Subject: lockd: use svc_set_num_threads() for thread start and stop

svc_set_num_threads() does everything that lockd_start_svc() does, except
set sv_maxconn.  It also (when passed 0) finds the threads and
stops them with kthread_stop().

So move the setting for sv_maxconn, and use svc_set_num_thread()

We now don't need nlmsvc_task.

Now that we use svc_set_num_threads() it makes sense to set svo_module.
This request that the thread exists with module_put_and_exit().
Also fix the documentation for svo_module to make this explicit.

svc_prepare_thread is now only used where it is defined, so it can be
made static.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index d69e6108cb83..cf175d47c6b7 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -64,7 +64,9 @@ struct svc_serv_ops {
 	/* queue up a transport for servicing */
 	void		(*svo_enqueue_xprt)(struct svc_xprt *);
 
-	/* optional module to count when adding threads (pooled svcs only) */
+	/* optional module to count when adding threads.
+	 * Thread function must call module_put_and_exit() to exit.
+	 */
 	struct module	*svo_module;
 };
 
@@ -504,8 +506,6 @@ struct svc_serv *svc_create(struct svc_program *, unsigned int,
 			    const struct svc_serv_ops *);
 struct svc_rqst *svc_rqst_alloc(struct svc_serv *serv,
 					struct svc_pool *pool, int node);
-struct svc_rqst *svc_prepare_thread(struct svc_serv *serv,
-					struct svc_pool *pool, int node);
 void		   svc_rqst_replace_page(struct svc_rqst *rqstp,
 					 struct page *page);
 void		   svc_rqst_free(struct svc_rqst *);
-- 
cgit v1.2.3


From c8064e5b4adac5e1255cf4f3b374e75b5376e7ca Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 30 Nov 2021 11:08:07 +0100
Subject: bpf: Let bpf_warn_invalid_xdp_action() report more info
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In non trivial scenarios, the action id alone is not sufficient to
identify the program causing the warning. Before the previous patch,
the generated stack-trace pointed out at least the involved device
driver.

Let's additionally include the program name and id, and the relevant
device name.

If the user needs additional infos, he can fetch them via a kernel
probe, leveraging the arguments added here.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://lore.kernel.org/bpf/ddb96bb975cbfddb1546cf5da60e77d5100b533c.1638189075.git.pabeni@redhat.com
---
 include/linux/filter.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 68c6b5c208e7..60eec80fa1d4 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1027,7 +1027,7 @@ void xdp_do_flush(void);
  */
 #define xdp_do_flush_map xdp_do_flush
 
-void bpf_warn_invalid_xdp_action(u32 act);
+void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, u32 act);
 
 #ifdef CONFIG_INET
 struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
-- 
cgit v1.2.3


From 22c3f2f56bd9c901e1da69040680c311f310635d Mon Sep 17 00:00:00 2001
From: Maor Gottlieb <maorg@nvidia.com>
Date: Wed, 1 Dec 2021 11:36:18 -0800
Subject: net/mlx5: Separate FDB namespace

This patch doesn't add an additional namespaces, but just separates the
naming to be used by each FDB user, bypass and kernel.
Downstream patches will actually split this up and allow to have more
than single priority for the bypass users.

Signed-off-by: Maor Gottlieb <maorg@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Acked-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/fs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index cd2d4c572367..b1aad14689e3 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -73,6 +73,7 @@ enum mlx5_flow_namespace_type {
 	MLX5_FLOW_NAMESPACE_KERNEL,
 	MLX5_FLOW_NAMESPACE_LEFTOVERS,
 	MLX5_FLOW_NAMESPACE_ANCHOR,
+	MLX5_FLOW_NAMESPACE_FDB_BYPASS,
 	MLX5_FLOW_NAMESPACE_FDB,
 	MLX5_FLOW_NAMESPACE_ESW_EGRESS,
 	MLX5_FLOW_NAMESPACE_ESW_INGRESS,
-- 
cgit v1.2.3


From 8aa45b544db9788b0fcc7a300eba8a3ade5d3d50 Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Fri, 10 Dec 2021 13:11:34 +0200
Subject: HID: Add map_msc() to avoid boilerplate code

Since we are going to have more MSC events too, add map_msc() that can
be used to fill in necessary fields and avoid boilerplate code.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reviewed-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Signed-off-by: Tero Kristo <tero.kristo@linux.intel.com>
Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Link: https://lore.kernel.org/r/20211210111138.1248187-2-tero.kristo@linux.intel.com
---
 include/linux/hid.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hid.h b/include/linux/hid.h
index b2fea7fc54a1..f18e2cf5d74d 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -1010,6 +1010,10 @@ static inline void hid_map_usage(struct hid_input *hidinput,
 		bmap = input->ledbit;
 		limit = LED_MAX;
 		break;
+	case EV_MSC:
+		bmap = input->mscbit;
+		limit = MSC_MAX;
+		break;
 	}
 
 	if (unlikely(c > limit || !bmap)) {
-- 
cgit v1.2.3


From ae7fafa6896ae762ff489a5e71c8e5fabfeb61ee Mon Sep 17 00:00:00 2001
From: Tero Kristo <tero.kristo@linux.intel.com>
Date: Fri, 10 Dec 2021 13:11:36 +0200
Subject: HID: Add hid usages for USI style pens

Add usage codes for USI style pens, based on the USB-HID usage table:
    https://usb.org/document-library/hid-usage-tables-122

See chapter 16, Digitizers Page (0x0D)

Signed-off-by: Tero Kristo <tero.kristo@linux.intel.com>
Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Link: https://lore.kernel.org/r/20211210111138.1248187-4-tero.kristo@linux.intel.com
---
 include/linux/hid.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hid.h b/include/linux/hid.h
index f18e2cf5d74d..fa2ddda84a53 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -241,6 +241,7 @@ struct hid_item {
 #define HID_DG_TOUCH		0x000d0033
 #define HID_DG_UNTOUCH		0x000d0034
 #define HID_DG_TAP		0x000d0035
+#define HID_DG_TRANSDUCER_INDEX	0x000d0038
 #define HID_DG_TABLETFUNCTIONKEY	0x000d0039
 #define HID_DG_PROGRAMCHANGEKEY	0x000d003a
 #define HID_DG_BATTERYSTRENGTH	0x000d003b
@@ -253,6 +254,15 @@ struct hid_item {
 #define HID_DG_BARRELSWITCH	0x000d0044
 #define HID_DG_ERASER		0x000d0045
 #define HID_DG_TABLETPICK	0x000d0046
+#define HID_DG_PEN_COLOR			0x000d005c
+#define HID_DG_PEN_LINE_WIDTH			0x000d005e
+#define HID_DG_PEN_LINE_STYLE			0x000d0070
+#define HID_DG_PEN_LINE_STYLE_INK		0x000d0072
+#define HID_DG_PEN_LINE_STYLE_PENCIL		0x000d0073
+#define HID_DG_PEN_LINE_STYLE_HIGHLIGHTER	0x000d0074
+#define HID_DG_PEN_LINE_STYLE_CHISEL_MARKER	0x000d0075
+#define HID_DG_PEN_LINE_STYLE_BRUSH		0x000d0076
+#define HID_DG_PEN_LINE_STYLE_NO_PREFERENCE	0x000d0077
 
 #define HID_CP_CONSUMERCONTROL	0x000c0001
 #define HID_CP_NUMERICKEYPAD	0x000c0002
-- 
cgit v1.2.3


From 5904a3f9d756f108279f8c5b8f17ca6ddfb28d24 Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Fri, 10 Dec 2021 13:11:37 +0200
Subject: HID: input: Make hidinput_find_field() static

This function is not called outside of hid-input.c so we can make it
static.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reviewed-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Signed-off-by: Tero Kristo <tero.kristo@linux.intel.com>
Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Link: https://lore.kernel.org/r/20211210111138.1248187-5-tero.kristo@linux.intel.com
---
 include/linux/hid.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/hid.h b/include/linux/hid.h
index fa2ddda84a53..ff8b6973022a 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -899,7 +899,6 @@ extern void hidinput_disconnect(struct hid_device *);
 
 int hid_set_field(struct hid_field *, unsigned, __s32);
 int hid_input_report(struct hid_device *, int type, u8 *, u32, int);
-int hidinput_find_field(struct hid_device *hid, unsigned int type, unsigned int code, struct hid_field **field);
 struct hid_field *hidinput_get_led_field(struct hid_device *hid);
 unsigned int hidinput_count_leds(struct hid_device *hid);
 __s32 hidinput_calc_abs_res(const struct hid_field *field, __u16 code);
-- 
cgit v1.2.3


From fd8d135b2c5e88662f2729e034913f183455a667 Mon Sep 17 00:00:00 2001
From: Alistair Francis <alistair@alistair23.me>
Date: Wed, 8 Dec 2021 22:40:43 +1000
Subject: HID: quirks: Allow inverting the absolute X/Y values

Add a HID_QUIRK_X_INVERT/HID_QUIRK_Y_INVERT quirk that can be used
to invert the X/Y values.

Signed-off-by: Alistair Francis <alistair@alistair23.me>
[bentiss: silence checkpatch warning]
Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Link: https://lore.kernel.org/r/20211208124045.61815-2-alistair@alistair23.me
---
 include/linux/hid.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hid.h b/include/linux/hid.h
index ff8b6973022a..a14c089eb0c1 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -359,6 +359,8 @@ struct hid_item {
 /* BIT(9) reserved for backward compatibility, was NO_INIT_INPUT_REPORTS */
 #define HID_QUIRK_ALWAYS_POLL			BIT(10)
 #define HID_QUIRK_INPUT_PER_APP			BIT(11)
+#define HID_QUIRK_X_INVERT			BIT(12)
+#define HID_QUIRK_Y_INVERT			BIT(13)
 #define HID_QUIRK_SKIP_OUTPUT_REPORTS		BIT(16)
 #define HID_QUIRK_SKIP_OUTPUT_REPORT_ID		BIT(17)
 #define HID_QUIRK_NO_OUTPUT_REPORTS_ON_INTR_EP	BIT(18)
-- 
cgit v1.2.3


From 369461ce8fb6c8156206c7110d7da48e9fbc41bb Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Wed, 8 Dec 2021 14:11:20 -0600
Subject: x86: perf: Move RDPMC event flag to a common definition

In preparation to enable user counter access on arm64 and to move some
of the user access handling to perf core, create a common event flag for
user counter access and convert x86 to use it.

Since the architecture specific flags start at the LSB, starting at the
MSB for common flags.

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: x86@kernel.org
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: linux-perf-users@vger.kernel.org
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20211208201124.310740-2-robh@kernel.org
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/perf_event.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 0dcfd265beed..ba9467972c09 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -129,6 +129,15 @@ struct hw_perf_event_extra {
 	int		idx;	/* index in shared_regs->regs[] */
 };
 
+/**
+ * hw_perf_event::flag values
+ *
+ * PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific
+ * usage.
+ */
+#define PERF_EVENT_FLAG_ARCH			0x0000ffff
+#define PERF_EVENT_FLAG_USER_READ_CNT		0x80000000
+
 /**
  * struct hw_perf_event - performance event hardware details:
  */
-- 
cgit v1.2.3


From 82ff0c022d19c2ad69a472692bb7ee01ac07a40b Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Wed, 8 Dec 2021 14:11:21 -0600
Subject: perf: Add a counter for number of user access events in context

On arm64, user space counter access will be controlled differently
compared to x86. On x86, access in the strictest mode is enabled for all
tasks in an MM when any event is mmap'ed. For arm64, access is
explicitly requested for an event and only enabled when the event's
context is active. This avoids hooks into the arch context switch code
and gives better control of when access is enabled.

In order to configure user space access when the PMU is enabled, it is
necessary to know if any event (currently active or not) in the current
context has user space accessed enabled. Add a counter similar to other
counters in the context to avoid walking the event list every time.

Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20211208201124.310740-3-robh@kernel.org
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/perf_event.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index ba9467972c09..411e34210fbf 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -831,6 +831,7 @@ struct perf_event_context {
 
 	int				nr_events;
 	int				nr_active;
+	int				nr_user;
 	int				is_active;
 	int				nr_stat;
 	int				nr_freq;
-- 
cgit v1.2.3


From 8404b0fbc7fbd42e5c5d28cdedd450e70829c77a Mon Sep 17 00:00:00 2001
From: Qi Liu <liuqi115@huawei.com>
Date: Thu, 2 Dec 2021 16:06:33 +0800
Subject: drivers/perf: hisi: Add driver for HiSilicon PCIe PMU
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PCIe PMU Root Complex Integrated End Point(RCiEP) device is supported
to sample bandwidth, latency, buffer occupation etc.

Each PMU RCiEP device monitors multiple Root Ports, and each RCiEP is
registered as a PMU in /sys/bus/event_source/devices, so users can
select target PMU, and use filter to do further sets.

Filtering options contains:
event     - select the event.
port      - select target Root Ports. Information of Root Ports are
            shown under sysfs.
bdf       - select requester_id of target EP device.
trig_len  - set trigger condition for starting event statistics.
trig_mode - set trigger mode. 0 means starting to statistic when bigger
            than trigger condition, and 1 means smaller.
thr_len   - set threshold for statistics.
thr_mode  - set threshold mode. 0 means count when bigger than threshold,
            and 1 means smaller.

Acked-by: Krzysztof Wilczyński <kw@linux.com>
Reviewed-by: John Garry <john.garry@huawei.com>
Signed-off-by: Qi Liu <liuqi115@huawei.com>
Reviewed-by: Shaokun Zhang <zhangshaokun@hisilicon.com>
Link: https://lore.kernel.org/r/20211202080633.2919-3-liuqi115@huawei.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/cpuhotplug.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 773c83730906..411a428ace4d 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -225,6 +225,7 @@ enum cpuhp_state {
 	CPUHP_AP_PERF_ARM_HISI_L3_ONLINE,
 	CPUHP_AP_PERF_ARM_HISI_PA_ONLINE,
 	CPUHP_AP_PERF_ARM_HISI_SLLC_ONLINE,
+	CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE,
 	CPUHP_AP_PERF_ARM_L2X0_ONLINE,
 	CPUHP_AP_PERF_ARM_QCOM_L2_ONLINE,
 	CPUHP_AP_PERF_ARM_QCOM_L3_ONLINE,
-- 
cgit v1.2.3


From c8a2a011cd04054a6577d8cf774adf9e09302876 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 14 Dec 2021 03:45:35 +0200
Subject: net: dsa: sja1105: fix broken connection with the sja1110 tagger

The driver was incorrectly converted assuming that "sja1105" is the only
tagger supported by this driver. This results in SJA1110 switches
failing to probe:

sja1105 spi1.0: Unable to connect to tag protocol "sja1110": -EPROTONOSUPPORT
sja1105: probe of spi1.2 failed with error -93

Add DSA_TAG_PROTO_SJA1110 to the list of supported taggers by the
sja1105 driver. The sja1105_tagger_data structure format is common for
the two tagging protocols.

Fixes: c79e84866d2a ("net: dsa: tag_sja1105: convert to tagger-owned data")
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dsa/sja1105.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h
index e9cb1ae6d742..159e43171ccc 100644
--- a/include/linux/dsa/sja1105.h
+++ b/include/linux/dsa/sja1105.h
@@ -70,7 +70,8 @@ struct sja1105_skb_cb {
 static inline struct sja1105_tagger_data *
 sja1105_tagger_data(struct dsa_switch *ds)
 {
-	BUG_ON(ds->dst->tag_ops->proto != DSA_TAG_PROTO_SJA1105);
+	BUG_ON(ds->dst->tag_ops->proto != DSA_TAG_PROTO_SJA1105 &&
+	       ds->dst->tag_ops->proto != DSA_TAG_PROTO_SJA1110);
 
 	return ds->tagger_data;
 }
-- 
cgit v1.2.3


From 9280ac2e6f199cddcd746a9ba459136b8666287b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 14 Dec 2021 07:15:15 -0800
Subject: net: dev_replace_track() cleanup

Use existing helpers (netdev_tracker_free()
and netdev_tracker_alloc()) to remove ifdefery.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20211214151515.312535-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 235d5d082f1a..c06e9dc1a317 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3885,16 +3885,14 @@ static inline void dev_replace_track(struct net_device *odev,
 				     netdevice_tracker *tracker,
 				     gfp_t gfp)
 {
-#ifdef CONFIG_NET_DEV_REFCNT_TRACKER
 	if (odev)
-		ref_tracker_free(&odev->refcnt_tracker, tracker);
-#endif
+		netdev_tracker_free(odev, tracker);
+
 	dev_hold(ndev);
 	dev_put(odev);
-#ifdef CONFIG_NET_DEV_REFCNT_TRACKER
+
 	if (ndev)
-		ref_tracker_alloc(&ndev->refcnt_tracker, tracker, gfp);
-#endif
+		netdev_tracker_alloc(ndev, tracker, gfp);
 }
 
 /* Carrier loss detection, dial on demand. The functions netif_carrier_on
-- 
cgit v1.2.3


From ad69cd9972e79aba103ba5365de0acd35770c265 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Mon, 29 Nov 2021 22:15:27 +0200
Subject: fsnotify: clarify object type argument

In preparation for separating object type from iterator type, rename
some 'type' arguments in functions to 'obj_type' and remove the unused
interface to clear marks by object type mask.

Link: https://lore.kernel.org/r/20211129201537.1932819-2-amir73il@gmail.com
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/linux/fsnotify_backend.h | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 51ef2b079bfa..b9c84b1dbcc8 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -338,6 +338,7 @@ static inline struct fs_error_report *fsnotify_data_error_report(
 }
 
 enum fsnotify_obj_type {
+	FSNOTIFY_OBJ_TYPE_ANY = -1,
 	FSNOTIFY_OBJ_TYPE_INODE,
 	FSNOTIFY_OBJ_TYPE_PARENT,
 	FSNOTIFY_OBJ_TYPE_VFSMOUNT,
@@ -346,15 +347,9 @@ enum fsnotify_obj_type {
 	FSNOTIFY_OBJ_TYPE_DETACHED = FSNOTIFY_OBJ_TYPE_COUNT
 };
 
-#define FSNOTIFY_OBJ_TYPE_INODE_FL	(1U << FSNOTIFY_OBJ_TYPE_INODE)
-#define FSNOTIFY_OBJ_TYPE_PARENT_FL	(1U << FSNOTIFY_OBJ_TYPE_PARENT)
-#define FSNOTIFY_OBJ_TYPE_VFSMOUNT_FL	(1U << FSNOTIFY_OBJ_TYPE_VFSMOUNT)
-#define FSNOTIFY_OBJ_TYPE_SB_FL		(1U << FSNOTIFY_OBJ_TYPE_SB)
-#define FSNOTIFY_OBJ_ALL_TYPES_MASK	((1U << FSNOTIFY_OBJ_TYPE_COUNT) - 1)
-
-static inline bool fsnotify_valid_obj_type(unsigned int type)
+static inline bool fsnotify_valid_obj_type(unsigned int obj_type)
 {
-	return (type < FSNOTIFY_OBJ_TYPE_COUNT);
+	return (obj_type < FSNOTIFY_OBJ_TYPE_COUNT);
 }
 
 struct fsnotify_iter_info {
@@ -387,7 +382,7 @@ static inline void fsnotify_iter_set_report_type_mark(
 static inline struct fsnotify_mark *fsnotify_iter_##name##_mark( \
 		struct fsnotify_iter_info *iter_info) \
 { \
-	return (iter_info->report_mask & FSNOTIFY_OBJ_TYPE_##NAME##_FL) ? \
+	return (iter_info->report_mask & (1U << FSNOTIFY_OBJ_TYPE_##NAME)) ? \
 		iter_info->marks[FSNOTIFY_OBJ_TYPE_##NAME] : NULL; \
 }
 
@@ -604,11 +599,11 @@ extern int fsnotify_get_conn_fsid(const struct fsnotify_mark_connector *conn,
 				  __kernel_fsid_t *fsid);
 /* attach the mark to the object */
 extern int fsnotify_add_mark(struct fsnotify_mark *mark,
-			     fsnotify_connp_t *connp, unsigned int type,
+			     fsnotify_connp_t *connp, unsigned int obj_type,
 			     int allow_dups, __kernel_fsid_t *fsid);
 extern int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
 				    fsnotify_connp_t *connp,
-				    unsigned int type, int allow_dups,
+				    unsigned int obj_type, int allow_dups,
 				    __kernel_fsid_t *fsid);
 
 /* attach the mark to the inode */
@@ -637,22 +632,23 @@ extern void fsnotify_detach_mark(struct fsnotify_mark *mark);
 extern void fsnotify_free_mark(struct fsnotify_mark *mark);
 /* Wait until all marks queued for destruction are destroyed */
 extern void fsnotify_wait_marks_destroyed(void);
-/* run all the marks in a group, and clear all of the marks attached to given object type */
-extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group, unsigned int type);
+/* Clear all of the marks of a group attached to a given object type */
+extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group,
+					  unsigned int obj_type);
 /* run all the marks in a group, and clear all of the vfsmount marks */
 static inline void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
 {
-	fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_VFSMOUNT_FL);
+	fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_VFSMOUNT);
 }
 /* run all the marks in a group, and clear all of the inode marks */
 static inline void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group)
 {
-	fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_INODE_FL);
+	fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_INODE);
 }
 /* run all the marks in a group, and clear all of the sn marks */
 static inline void fsnotify_clear_sb_marks_by_group(struct fsnotify_group *group)
 {
-	fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_SB_FL);
+	fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_SB);
 }
 extern void fsnotify_get_mark(struct fsnotify_mark *mark);
 extern void fsnotify_put_mark(struct fsnotify_mark *mark);
-- 
cgit v1.2.3


From 1c9007d62bea6fd164285314f7553f73e5308863 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Mon, 29 Nov 2021 22:15:28 +0200
Subject: fsnotify: separate mark iterator type from object type enum

They are two different types that use the same enum, so this confusing.

Use the object type to indicate the type of object mark is attached to
and the iter type to indicate the type of watch.

A group can have two different watches of the same object type (parent
and child watches) that match the same event.

Link: https://lore.kernel.org/r/20211129201537.1932819-3-amir73il@gmail.com
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/linux/fsnotify_backend.h | 41 +++++++++++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index b9c84b1dbcc8..73739fee1710 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -337,10 +337,25 @@ static inline struct fs_error_report *fsnotify_data_error_report(
 	}
 }
 
+/*
+ * Index to merged marks iterator array that correlates to a type of watch.
+ * The type of watched object can be deduced from the iterator type, but not
+ * the other way around, because an event can match different watched objects
+ * of the same object type.
+ * For example, both parent and child are watching an object of type inode.
+ */
+enum fsnotify_iter_type {
+	FSNOTIFY_ITER_TYPE_INODE,
+	FSNOTIFY_ITER_TYPE_VFSMOUNT,
+	FSNOTIFY_ITER_TYPE_SB,
+	FSNOTIFY_ITER_TYPE_PARENT,
+	FSNOTIFY_ITER_TYPE_COUNT
+};
+
+/* The type of object that a mark is attached to */
 enum fsnotify_obj_type {
 	FSNOTIFY_OBJ_TYPE_ANY = -1,
 	FSNOTIFY_OBJ_TYPE_INODE,
-	FSNOTIFY_OBJ_TYPE_PARENT,
 	FSNOTIFY_OBJ_TYPE_VFSMOUNT,
 	FSNOTIFY_OBJ_TYPE_SB,
 	FSNOTIFY_OBJ_TYPE_COUNT,
@@ -353,37 +368,37 @@ static inline bool fsnotify_valid_obj_type(unsigned int obj_type)
 }
 
 struct fsnotify_iter_info {
-	struct fsnotify_mark *marks[FSNOTIFY_OBJ_TYPE_COUNT];
+	struct fsnotify_mark *marks[FSNOTIFY_ITER_TYPE_COUNT];
 	unsigned int report_mask;
 	int srcu_idx;
 };
 
 static inline bool fsnotify_iter_should_report_type(
-		struct fsnotify_iter_info *iter_info, int type)
+		struct fsnotify_iter_info *iter_info, int iter_type)
 {
-	return (iter_info->report_mask & (1U << type));
+	return (iter_info->report_mask & (1U << iter_type));
 }
 
 static inline void fsnotify_iter_set_report_type(
-		struct fsnotify_iter_info *iter_info, int type)
+		struct fsnotify_iter_info *iter_info, int iter_type)
 {
-	iter_info->report_mask |= (1U << type);
+	iter_info->report_mask |= (1U << iter_type);
 }
 
 static inline void fsnotify_iter_set_report_type_mark(
-		struct fsnotify_iter_info *iter_info, int type,
+		struct fsnotify_iter_info *iter_info, int iter_type,
 		struct fsnotify_mark *mark)
 {
-	iter_info->marks[type] = mark;
-	iter_info->report_mask |= (1U << type);
+	iter_info->marks[iter_type] = mark;
+	iter_info->report_mask |= (1U << iter_type);
 }
 
 #define FSNOTIFY_ITER_FUNCS(name, NAME) \
 static inline struct fsnotify_mark *fsnotify_iter_##name##_mark( \
 		struct fsnotify_iter_info *iter_info) \
 { \
-	return (iter_info->report_mask & (1U << FSNOTIFY_OBJ_TYPE_##NAME)) ? \
-		iter_info->marks[FSNOTIFY_OBJ_TYPE_##NAME] : NULL; \
+	return (iter_info->report_mask & (1U << FSNOTIFY_ITER_TYPE_##NAME)) ? \
+		iter_info->marks[FSNOTIFY_ITER_TYPE_##NAME] : NULL; \
 }
 
 FSNOTIFY_ITER_FUNCS(inode, INODE)
@@ -391,8 +406,8 @@ FSNOTIFY_ITER_FUNCS(parent, PARENT)
 FSNOTIFY_ITER_FUNCS(vfsmount, VFSMOUNT)
 FSNOTIFY_ITER_FUNCS(sb, SB)
 
-#define fsnotify_foreach_obj_type(type) \
-	for (type = 0; type < FSNOTIFY_OBJ_TYPE_COUNT; type++)
+#define fsnotify_foreach_iter_type(type) \
+	for (type = 0; type < FSNOTIFY_ITER_TYPE_COUNT; type++)
 
 /*
  * fsnotify_connp_t is what we embed in objects which connector can be attached
-- 
cgit v1.2.3


From d61fd650e9d206a71fda789f02a1ced4b19944c4 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Mon, 29 Nov 2021 22:15:29 +0200
Subject: fanotify: introduce group flag FAN_REPORT_TARGET_FID

FAN_REPORT_FID is ambiguous in that it reports the fid of the child for
some events and the fid of the parent for create/delete/move events.

The new FAN_REPORT_TARGET_FID flag is an implicit request to report
the fid of the target object of the operation (a.k.a the child inode)
also in create/delete/move events in addition to the fid of the parent
and the name of the child.

To reduce the test matrix for uninteresting use cases, the new
FAN_REPORT_TARGET_FID flag requires both FAN_REPORT_NAME and
FAN_REPORT_FID.  The convenience macro FAN_REPORT_DFID_NAME_TARGET
combines FAN_REPORT_TARGET_FID with all the required flags.

Link: https://lore.kernel.org/r/20211129201537.1932819-4-amir73il@gmail.com
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/linux/fanotify.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index 616af2ea20f3..376e050e6f38 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -25,7 +25,7 @@ extern struct ctl_table fanotify_table[]; /* for sysctl */
 
 #define FANOTIFY_CLASS_BITS	(FAN_CLASS_NOTIF | FANOTIFY_PERM_CLASSES)
 
-#define FANOTIFY_FID_BITS	(FAN_REPORT_FID | FAN_REPORT_DFID_NAME)
+#define FANOTIFY_FID_BITS	(FAN_REPORT_DFID_NAME_TARGET)
 
 #define FANOTIFY_INFO_MODES	(FANOTIFY_FID_BITS | FAN_REPORT_PIDFD)
 
-- 
cgit v1.2.3


From e54183fa7047c15819bc155f4c58501d9a9a3489 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Mon, 29 Nov 2021 22:15:30 +0200
Subject: fsnotify: generate FS_RENAME event with rich information

The dnotify FS_DN_RENAME event is used to request notification about
a move within the same parent directory and was always coupled with
the FS_MOVED_FROM event.

Rename the FS_DN_RENAME event flag to FS_RENAME, decouple it from
FS_MOVED_FROM and report it with the moved dentry instead of the moved
inode, so it has the information about both old and new parent and name.

Generate the FS_RENAME event regardless of same parent dir and apply
the "same parent" rule in the generic fsnotify_handle_event() helper
that is used to call backends with ->handle_inode_event() method
(i.e. dnotify).  The ->handle_inode_event() method is not rich enough to
report both old and new parent and name anyway.

The enriched event is reported to fanotify over the ->handle_event()
method with the old and new dir inode marks in marks array slots for
ITER_TYPE_INODE and a new iter type slot ITER_TYPE_INODE2.

The enriched event will be used for reporting old and new parent+name to
fanotify groups with FAN_RENAME events.

Link: https://lore.kernel.org/r/20211129201537.1932819-5-amir73il@gmail.com
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/linux/dnotify.h          | 2 +-
 include/linux/fsnotify.h         | 9 ++++++---
 include/linux/fsnotify_backend.h | 7 ++++---
 3 files changed, 11 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dnotify.h b/include/linux/dnotify.h
index 0aad774beaec..b87c3b85a166 100644
--- a/include/linux/dnotify.h
+++ b/include/linux/dnotify.h
@@ -26,7 +26,7 @@ struct dnotify_struct {
 			    FS_MODIFY | FS_MODIFY_CHILD |\
 			    FS_ACCESS | FS_ACCESS_CHILD |\
 			    FS_ATTRIB | FS_ATTRIB_CHILD |\
-			    FS_CREATE | FS_DN_RENAME |\
+			    FS_CREATE | FS_RENAME |\
 			    FS_MOVED_FROM | FS_MOVED_TO)
 
 extern int dir_notify_enable;
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 787545e87eeb..3a2d7dc3c607 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -144,16 +144,19 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 	u32 fs_cookie = fsnotify_get_cookie();
 	__u32 old_dir_mask = FS_MOVED_FROM;
 	__u32 new_dir_mask = FS_MOVED_TO;
+	__u32 rename_mask = FS_RENAME;
 	const struct qstr *new_name = &moved->d_name;
 
-	if (old_dir == new_dir)
-		old_dir_mask |= FS_DN_RENAME;
-
 	if (isdir) {
 		old_dir_mask |= FS_ISDIR;
 		new_dir_mask |= FS_ISDIR;
+		rename_mask |= FS_ISDIR;
 	}
 
+	/* Event with information about both old and new parent+name */
+	fsnotify_name(rename_mask, moved, FSNOTIFY_EVENT_DENTRY,
+		      old_dir, old_name, 0);
+
 	fsnotify_name(old_dir_mask, source, FSNOTIFY_EVENT_INODE,
 		      old_dir, old_name, fs_cookie);
 	fsnotify_name(new_dir_mask, source, FSNOTIFY_EVENT_INODE,
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 73739fee1710..790c31844db5 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -63,7 +63,7 @@
  */
 #define FS_EVENT_ON_CHILD	0x08000000
 
-#define FS_DN_RENAME		0x10000000	/* file renamed */
+#define FS_RENAME		0x10000000	/* File was renamed */
 #define FS_DN_MULTISHOT		0x20000000	/* dnotify multishot */
 #define FS_ISDIR		0x40000000	/* event occurred against dir */
 #define FS_IN_ONESHOT		0x80000000	/* only send event once */
@@ -76,7 +76,7 @@
  * The watching parent may get an FS_ATTRIB|FS_EVENT_ON_CHILD event
  * when a directory entry inside a child subdir changes.
  */
-#define ALL_FSNOTIFY_DIRENT_EVENTS	(FS_CREATE | FS_DELETE | FS_MOVE)
+#define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE | FS_RENAME)
 
 #define ALL_FSNOTIFY_PERM_EVENTS (FS_OPEN_PERM | FS_ACCESS_PERM | \
 				  FS_OPEN_EXEC_PERM)
@@ -101,7 +101,7 @@
 /* Events that can be reported to backends */
 #define ALL_FSNOTIFY_EVENTS (ALL_FSNOTIFY_DIRENT_EVENTS | \
 			     FS_EVENTS_POSS_ON_CHILD | \
-			     FS_DELETE_SELF | FS_MOVE_SELF | FS_DN_RENAME | \
+			     FS_DELETE_SELF | FS_MOVE_SELF | \
 			     FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED | \
 			     FS_ERROR)
 
@@ -349,6 +349,7 @@ enum fsnotify_iter_type {
 	FSNOTIFY_ITER_TYPE_VFSMOUNT,
 	FSNOTIFY_ITER_TYPE_SB,
 	FSNOTIFY_ITER_TYPE_PARENT,
+	FSNOTIFY_ITER_TYPE_INODE2,
 	FSNOTIFY_ITER_TYPE_COUNT
 };
 
-- 
cgit v1.2.3


From 8cc3b1ccd930fe6971e1527f0c4f1bdc8cb56026 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Mon, 29 Nov 2021 22:15:37 +0200
Subject: fanotify: wire up FAN_RENAME event

FAN_RENAME is the successor of FAN_MOVED_FROM and FAN_MOVED_TO
and can be used to get the old and new parent+name information in
a single event.

FAN_MOVED_FROM and FAN_MOVED_TO are still supported for backward
compatibility, but it makes little sense to use them together with
FAN_RENAME in the same group.

FAN_RENAME uses special info type records to report the old and
new parent+name, so reporting only old and new parent id is less
useful and was not implemented.
Therefore, FAN_REANAME requires a group with flag FAN_REPORT_NAME.

Link: https://lore.kernel.org/r/20211129201537.1932819-12-amir73il@gmail.com
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/linux/fanotify.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index 376e050e6f38..3afdf339d53c 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -82,7 +82,8 @@ extern struct ctl_table fanotify_table[]; /* for sysctl */
  * Directory entry modification events - reported only to directory
  * where entry is modified and not to a watching parent.
  */
-#define FANOTIFY_DIRENT_EVENTS	(FAN_MOVE | FAN_CREATE | FAN_DELETE)
+#define FANOTIFY_DIRENT_EVENTS	(FAN_MOVE | FAN_CREATE | FAN_DELETE | \
+				 FAN_RENAME)
 
 /* Events that can be reported with event->fd */
 #define FANOTIFY_FD_EVENTS (FANOTIFY_PATH_EVENTS | FANOTIFY_PERM_EVENTS)
-- 
cgit v1.2.3


From f1d9268e061863ead77b07f5a6807d063e28a1c2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 14 Dec 2021 07:09:33 -0800
Subject: net: add net device refcount tracker to struct packet_type

Most notable changes are in af_packet, tipc ones are trivial.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Jon Maloy <jmaloy@redhat.com>
Cc: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c06e9dc1a317..a419718612c6 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2533,6 +2533,7 @@ struct packet_type {
 	__be16			type;	/* This is really htons(ether_type). */
 	bool			ignore_outgoing;
 	struct net_device	*dev;	/* NULL is wildcarded here	     */
+	netdevice_tracker	dev_tracker;
 	int			(*func) (struct sk_buff *,
 					 struct net_device *,
 					 struct packet_type *,
-- 
cgit v1.2.3


From 685b1afd7911676691c4167f420e16a957f5a38e Mon Sep 17 00:00:00 2001
From: Shay Drory <shayd@nvidia.com>
Date: Thu, 9 Dec 2021 12:09:23 +0200
Subject: net/mlx5: Introduce log_max_current_uc_list_wr_supported bit

Downstream patch will use this bit in order to know whether the device
supports changing of max_uc_list.

Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 3636df90899a..d3899fc33fd7 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1621,7 +1621,7 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 
 	u8         ext_stride_num_range[0x1];
 	u8         roce_rw_supported[0x1];
-	u8         reserved_at_3a2[0x1];
+	u8         log_max_current_uc_list_wr_supported[0x1];
 	u8         log_max_stride_sz_rq[0x5];
 	u8         reserved_at_3a8[0x3];
 	u8         log_min_stride_sz_rq[0x5];
-- 
cgit v1.2.3


From ff5f87cb6a75dbf6d30668d2464e46249dd5c47f Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Fri, 3 Dec 2021 11:28:49 +0100
Subject: clk: Introduce clk-tps68470 driver

The TPS68470 PMIC provides Clocks, GPIOs and Regulators. At present in
the kernel the Regulators and Clocks are controlled by an OpRegion
driver designed to work with power control methods defined in ACPI, but
some platforms lack those methods, meaning drivers need to be able to
consume the resources of these chips through the usual frameworks.

This commit adds a driver for the clocks provided by the tps68470,
and is designed to bind to the platform_device registered by the
intel_skl_int3472 module.

This is based on this out of tree driver written by Intel:
https://github.com/intel/linux-intel-lts/blob/4.14/base/drivers/clk/clk-tps68470.c
with various cleanups added.

Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Link: https://lore.kernel.org/r/20211203102857.44539-7-hdegoede@redhat.com
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/linux/mfd/tps68470.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/tps68470.h b/include/linux/mfd/tps68470.h
index ffe81127d91c..7807fa329db0 100644
--- a/include/linux/mfd/tps68470.h
+++ b/include/linux/mfd/tps68470.h
@@ -75,6 +75,17 @@
 #define TPS68470_CLKCFG1_MODE_A_MASK	GENMASK(1, 0)
 #define TPS68470_CLKCFG1_MODE_B_MASK	GENMASK(3, 2)
 
+#define TPS68470_CLKCFG2_DRV_STR_2MA	0x05
+#define TPS68470_PLL_OUTPUT_ENABLE	0x02
+#define TPS68470_CLK_SRC_XTAL		BIT(0)
+#define TPS68470_PLLSWR_DEFAULT		GENMASK(1, 0)
+#define TPS68470_OSC_EXT_CAP_DEFAULT	0x05
+
+#define TPS68470_OUTPUT_A_SHIFT		0x00
+#define TPS68470_OUTPUT_B_SHIFT		0x02
+#define TPS68470_CLK_SRC_SHIFT		GENMASK(2, 0)
+#define TPS68470_OSC_EXT_CAP_SHIFT	BIT(2)
+
 #define TPS68470_GPIO_CTL_REG_A(x)	(TPS68470_REG_GPCTL0A + (x) * 2)
 #define TPS68470_GPIO_CTL_REG_B(x)	(TPS68470_REG_GPCTL0B + (x) * 2)
 #define TPS68470_GPIO_MODE_MASK		GENMASK(1, 0)
-- 
cgit v1.2.3


From dfd0743f1d9ea76931510ed150334d571fbab49d Mon Sep 17 00:00:00 2001
From: Jens Wiklander <jens.wiklander@linaro.org>
Date: Thu, 9 Dec 2021 15:59:37 +0100
Subject: tee: handle lookup of shm with reference count 0

Since the tee subsystem does not keep a strong reference to its idle
shared memory buffers, it races with other threads that try to destroy a
shared memory through a close of its dma-buf fd or by unmapping the
memory.

In tee_shm_get_from_id() when a lookup in teedev->idr has been
successful, it is possible that the tee_shm is in the dma-buf teardown
path, but that path is blocked by the teedev mutex. Since we don't have
an API to tell if the tee_shm is in the dma-buf teardown path or not we
must find another way of detecting this condition.

Fix this by doing the reference counting directly on the tee_shm using a
new refcount_t refcount field. dma-buf is replaced by using
anon_inode_getfd() instead, this separates the life-cycle of the
underlying file from the tee_shm. tee_shm_put() is updated to hold the
mutex when decreasing the refcount to 0 and then remove the tee_shm from
teedev->idr before releasing the mutex. This means that the tee_shm can
never be found unless it has a refcount larger than 0.

Fixes: 967c9cca2cc5 ("tee: generic TEE subsystem")
Cc: stable@vger.kernel.org
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Lars Persson <larper@axis.com>
Reviewed-by: Sumit Garg <sumit.garg@linaro.org>
Reported-by: Patrik Lantz <patrik.lantz@axis.com>
Signed-off-by: Jens Wiklander <jens.wiklander@linaro.org>
---
 include/linux/tee_drv.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h
index a1f03461369b..cf5999626e28 100644
--- a/include/linux/tee_drv.h
+++ b/include/linux/tee_drv.h
@@ -195,7 +195,7 @@ int tee_session_calc_client_uuid(uuid_t *uuid, u32 connection_method,
  * @offset:	offset of buffer in user space
  * @pages:	locked pages from userspace
  * @num_pages:	number of locked pages
- * @dmabuf:	dmabuf used to for exporting to user space
+ * @refcount:	reference counter
  * @flags:	defined by TEE_SHM_* in tee_drv.h
  * @id:		unique id of a shared memory object on this device, shared
  *		with user space
@@ -214,7 +214,7 @@ struct tee_shm {
 	unsigned int offset;
 	struct page **pages;
 	size_t num_pages;
-	struct dma_buf *dmabuf;
+	refcount_t refcount;
 	u32 flags;
 	int id;
 	u64 sec_world_id;
-- 
cgit v1.2.3


From d1e86325af377129adb7fc6f34eb044ca6068b47 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Wed, 15 Dec 2021 15:34:15 +0000
Subject: net: phylink: add mac_select_pcs() method to phylink_mac_ops

mac_select_pcs() allows us to have an explicit point to query which
PCS the MAC wishes to use for a particular PHY interface mode, thereby
allowing us to add support to validate the link settings with the PCS.

Phylink will also use this to select the PCS to be used during a major
configuration event without the MAC driver needing to call
phylink_set_pcs().

Note that if mac_select_pcs() is present, the supported_interfaces
bitmap must be filled in; this avoids mac_select_pcs() being called
with PHY_INTERFACE_MODE_NA when we want to get support for all
interface types. Phylink will return an error in phylink_create()
unless this condition is satisfied.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phylink.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index a2f266cc3442..b3086dcafeaf 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -112,6 +112,7 @@ struct phylink_config {
 /**
  * struct phylink_mac_ops - MAC operations structure.
  * @validate: Validate and update the link configuration.
+ * @mac_select_pcs: Select a PCS for the interface mode.
  * @mac_pcs_get_state: Read the current link state from the hardware.
  * @mac_prepare: prepare for a major reconfiguration of the interface.
  * @mac_config: configure the MAC for the selected mode and state.
@@ -126,6 +127,8 @@ struct phylink_mac_ops {
 	void (*validate)(struct phylink_config *config,
 			 unsigned long *supported,
 			 struct phylink_link_state *state);
+	struct phylink_pcs *(*mac_select_pcs)(struct phylink_config *config,
+					      phy_interface_t interface);
 	void (*mac_pcs_get_state)(struct phylink_config *config,
 				  struct phylink_link_state *state);
 	int (*mac_prepare)(struct phylink_config *config, unsigned int mode,
@@ -178,6 +181,21 @@ struct phylink_mac_ops {
  */
 void validate(struct phylink_config *config, unsigned long *supported,
 	      struct phylink_link_state *state);
+/**
+ * mac_select_pcs: Select a PCS for the interface mode.
+ * @config: a pointer to a &struct phylink_config.
+ * @interface: PHY interface mode for PCS
+ *
+ * Return the &struct phylink_pcs for the specified interface mode, or
+ * NULL if none is required, or an error pointer on error.
+ *
+ * This must not modify any state. It is used to query which PCS should
+ * be used. Phylink will use this during validation to ensure that the
+ * configuration is valid, and when setting a configuration to internally
+ * set the PCS that will be used.
+ */
+struct phylink_pcs *mac_select_pcs(struct phylink_config *config,
+				   phy_interface_t interface);
 
 /**
  * mac_pcs_get_state() - Read the current inband link state from the hardware
-- 
cgit v1.2.3


From 0d22d4b626a4eaa3196019092eb6c1919e9f8caa Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Wed, 15 Dec 2021 15:34:20 +0000
Subject: net: phylink: add pcs_validate() method

Add a hook for PCS to validate the link parameters. This avoids MAC
drivers having to have knowledge of their PCS in their validate()
method, thereby allowing several MAC drivers to be simplfied.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phylink.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index b3086dcafeaf..713a0c928b7c 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -416,6 +416,7 @@ struct phylink_pcs {
 
 /**
  * struct phylink_pcs_ops - MAC PCS operations structure.
+ * @pcs_validate: validate the link configuration.
  * @pcs_get_state: read the current MAC PCS link state from the hardware.
  * @pcs_config: configure the MAC PCS for the selected mode and state.
  * @pcs_an_restart: restart 802.3z BaseX autonegotiation.
@@ -423,6 +424,8 @@ struct phylink_pcs {
  *               (where necessary).
  */
 struct phylink_pcs_ops {
+	int (*pcs_validate)(struct phylink_pcs *pcs, unsigned long *supported,
+			    const struct phylink_link_state *state);
 	void (*pcs_get_state)(struct phylink_pcs *pcs,
 			      struct phylink_link_state *state);
 	int (*pcs_config)(struct phylink_pcs *pcs, unsigned int mode,
@@ -435,6 +438,23 @@ struct phylink_pcs_ops {
 };
 
 #if 0 /* For kernel-doc purposes only. */
+/**
+ * pcs_validate() - validate the link configuration.
+ * @pcs: a pointer to a &struct phylink_pcs.
+ * @supported: ethtool bitmask for supported link modes.
+ * @state: a const pointer to a &struct phylink_link_state.
+ *
+ * Validate the interface mode, and advertising's autoneg bit, removing any
+ * media ethtool link modes that would not be supportable from the supported
+ * mask. Phylink will propagate the changes to the advertising mask. See the
+ * &struct phylink_mac_ops validate() method.
+ *
+ * Returns -EINVAL if the interface mode/autoneg mode is not supported.
+ * Returns non-zero positive if the link state can be supported.
+ */
+int pcs_validate(struct phylink_pcs *pcs, unsigned long *supported,
+		 const struct phylink_link_state *state);
+
 /**
  * pcs_get_state() - Read the current inband link state from the hardware
  * @pcs: a pointer to a &struct phylink_pcs.
-- 
cgit v1.2.3


From c6aeaf56f468a565f6d2f27325fc07d35cdcd3cb Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Thu, 9 Sep 2021 15:51:24 +0200
Subject: drm/tegra: Implement correct DMA-BUF semantics

DMA-BUF requires that each device that accesses a DMA-BUF attaches to it
separately. To do so the host1x_bo_pin() and host1x_bo_unpin() functions
need to be reimplemented so that they can return a mapping, which either
represents an attachment or a map of the driver's own GEM object.

Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 include/linux/host1x.h | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/host1x.h b/include/linux/host1x.h
index 7bccf589aba7..157326074df8 100644
--- a/include/linux/host1x.h
+++ b/include/linux/host1x.h
@@ -7,6 +7,7 @@
 #define __LINUX_HOST1X_H
 
 #include <linux/device.h>
+#include <linux/dma-direction.h>
 #include <linux/types.h>
 
 enum host1x_class {
@@ -82,12 +83,23 @@ struct host1x_client {
 struct host1x_bo;
 struct sg_table;
 
+struct host1x_bo_mapping {
+	struct dma_buf_attachment *attach;
+	enum dma_data_direction direction;
+	struct host1x_bo *bo;
+	struct sg_table *sgt;
+	unsigned int chunks;
+	struct device *dev;
+	dma_addr_t phys;
+	size_t size;
+};
+
 struct host1x_bo_ops {
 	struct host1x_bo *(*get)(struct host1x_bo *bo);
 	void (*put)(struct host1x_bo *bo);
-	struct sg_table *(*pin)(struct device *dev, struct host1x_bo *bo,
-				dma_addr_t *phys);
-	void (*unpin)(struct device *dev, struct sg_table *sgt);
+	struct host1x_bo_mapping *(*pin)(struct device *dev, struct host1x_bo *bo,
+					 enum dma_data_direction dir);
+	void (*unpin)(struct host1x_bo_mapping *map);
 	void *(*mmap)(struct host1x_bo *bo);
 	void (*munmap)(struct host1x_bo *bo, void *addr);
 };
@@ -112,17 +124,15 @@ static inline void host1x_bo_put(struct host1x_bo *bo)
 	bo->ops->put(bo);
 }
 
-static inline struct sg_table *host1x_bo_pin(struct device *dev,
-					     struct host1x_bo *bo,
-					     dma_addr_t *phys)
+static inline struct host1x_bo_mapping *host1x_bo_pin(struct device *dev, struct host1x_bo *bo,
+						      enum dma_data_direction dir)
 {
-	return bo->ops->pin(dev, bo, phys);
+	return bo->ops->pin(dev, bo, dir);
 }
 
-static inline void host1x_bo_unpin(struct device *dev, struct host1x_bo *bo,
-				   struct sg_table *sgt)
+static inline void host1x_bo_unpin(struct host1x_bo_mapping *map)
 {
-	bo->ops->unpin(dev, sgt);
+	map->bo->ops->unpin(map);
 }
 
 static inline void *host1x_bo_mmap(struct host1x_bo *bo)
-- 
cgit v1.2.3


From 1f39b1dfa53c84b56d7ad37fed44afda7004959d Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Fri, 7 Feb 2020 16:50:52 +0100
Subject: drm/tegra: Implement buffer object cache

This cache is used to avoid mapping and unmapping buffer objects
unnecessarily. Mappings are cached per client and stay hot until
the buffer object is destroyed.

Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 include/linux/host1x.h | 53 ++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 43 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/host1x.h b/include/linux/host1x.h
index 157326074df8..d0bb16cdd005 100644
--- a/include/linux/host1x.h
+++ b/include/linux/host1x.h
@@ -8,6 +8,7 @@
 
 #include <linux/device.h>
 #include <linux/dma-direction.h>
+#include <linux/spinlock.h>
 #include <linux/types.h>
 
 enum host1x_class {
@@ -24,6 +25,28 @@ struct iommu_group;
 
 u64 host1x_get_dma_mask(struct host1x *host1x);
 
+/**
+ * struct host1x_bo_cache - host1x buffer object cache
+ * @mappings: list of mappings
+ * @lock: synchronizes accesses to the list of mappings
+ */
+struct host1x_bo_cache {
+	struct list_head mappings;
+	struct mutex lock;
+};
+
+static inline void host1x_bo_cache_init(struct host1x_bo_cache *cache)
+{
+	INIT_LIST_HEAD(&cache->mappings);
+	mutex_init(&cache->lock);
+}
+
+static inline void host1x_bo_cache_destroy(struct host1x_bo_cache *cache)
+{
+	/* XXX warn if not empty? */
+	mutex_destroy(&cache->lock);
+}
+
 /**
  * struct host1x_client_ops - host1x client operations
  * @early_init: host1x client early initialization code
@@ -74,6 +97,8 @@ struct host1x_client {
 	struct host1x_client *parent;
 	unsigned int usecount;
 	struct mutex lock;
+
+	struct host1x_bo_cache cache;
 };
 
 /*
@@ -84,16 +109,26 @@ struct host1x_bo;
 struct sg_table;
 
 struct host1x_bo_mapping {
+	struct kref ref;
 	struct dma_buf_attachment *attach;
 	enum dma_data_direction direction;
+	struct list_head list;
 	struct host1x_bo *bo;
 	struct sg_table *sgt;
 	unsigned int chunks;
 	struct device *dev;
 	dma_addr_t phys;
 	size_t size;
+
+	struct host1x_bo_cache *cache;
+	struct list_head entry;
 };
 
+static inline struct host1x_bo_mapping *to_host1x_bo_mapping(struct kref *ref)
+{
+	return container_of(ref, struct host1x_bo_mapping, ref);
+}
+
 struct host1x_bo_ops {
 	struct host1x_bo *(*get)(struct host1x_bo *bo);
 	void (*put)(struct host1x_bo *bo);
@@ -106,11 +141,15 @@ struct host1x_bo_ops {
 
 struct host1x_bo {
 	const struct host1x_bo_ops *ops;
+	struct list_head mappings;
+	spinlock_t lock;
 };
 
 static inline void host1x_bo_init(struct host1x_bo *bo,
 				  const struct host1x_bo_ops *ops)
 {
+	INIT_LIST_HEAD(&bo->mappings);
+	spin_lock_init(&bo->lock);
 	bo->ops = ops;
 }
 
@@ -124,16 +163,10 @@ static inline void host1x_bo_put(struct host1x_bo *bo)
 	bo->ops->put(bo);
 }
 
-static inline struct host1x_bo_mapping *host1x_bo_pin(struct device *dev, struct host1x_bo *bo,
-						      enum dma_data_direction dir)
-{
-	return bo->ops->pin(dev, bo, dir);
-}
-
-static inline void host1x_bo_unpin(struct host1x_bo_mapping *map)
-{
-	map->bo->ops->unpin(map);
-}
+struct host1x_bo_mapping *host1x_bo_pin(struct device *dev, struct host1x_bo *bo,
+					enum dma_data_direction dir,
+					struct host1x_bo_cache *cache);
+void host1x_bo_unpin(struct host1x_bo_mapping *map);
 
 static inline void *host1x_bo_mmap(struct host1x_bo *bo)
 {
-- 
cgit v1.2.3


From 46f226c93d35b936aeec6eb31da932dc2e86f413 Mon Sep 17 00:00:00 2001
From: Mikko Perttunen <mperttunen@nvidia.com>
Date: Thu, 16 Sep 2021 17:55:17 +0300
Subject: drm/tegra: Add NVDEC driver

Add support for booting and using NVDEC on Tegra210, Tegra186
and Tegra194 to the Host1x and TegraDRM drivers. Booting in
secure mode is not currently supported.

Signed-off-by: Mikko Perttunen <mperttunen@nvidia.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 include/linux/host1x.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/host1x.h b/include/linux/host1x.h
index d0bb16cdd005..2ca53d7ed7ca 100644
--- a/include/linux/host1x.h
+++ b/include/linux/host1x.h
@@ -17,6 +17,8 @@ enum host1x_class {
 	HOST1X_CLASS_GR2D_SB = 0x52,
 	HOST1X_CLASS_VIC = 0x5D,
 	HOST1X_CLASS_GR3D = 0x60,
+	HOST1X_CLASS_NVDEC = 0xF0,
+	HOST1X_CLASS_NVDEC1 = 0xF5,
 };
 
 struct host1x;
-- 
cgit v1.2.3


From 9ca790f44606109071ab1a3a37ed99e91794c37c Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <digetx@gmail.com>
Date: Wed, 1 Dec 2021 02:23:16 +0300
Subject: gpu: host1x: Add host1x_channel_stop()

Add host1x_channel_stop() which waits till channel becomes idle and then
stops the channel hardware. This is needed for supporting suspend/resume
by host1x drivers since the hardware state is lost after power-gating,
thus the channel needs to be stopped before client enters into suspend.

Tested-by: Peter Geis <pgwipeout@gmail.com> # Ouya T30
Tested-by: Paul Fertser <fercerpav@gmail.com> # PAZ00 T20
Tested-by: Nicolas Chauvet <kwizart@gmail.com> # PAZ00 T20 and TK1 T124
Tested-by: Matt Merhar <mattmerhar@protonmail.com> # Ouya T30
Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 include/linux/host1x.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/host1x.h b/include/linux/host1x.h
index 2ca53d7ed7ca..e8dc5bc41f79 100644
--- a/include/linux/host1x.h
+++ b/include/linux/host1x.h
@@ -226,6 +226,7 @@ struct host1x_job;
 
 struct host1x_channel *host1x_channel_request(struct host1x_client *client);
 struct host1x_channel *host1x_channel_get(struct host1x_channel *channel);
+void host1x_channel_stop(struct host1x_channel *channel);
 void host1x_channel_put(struct host1x_channel *channel);
 int host1x_job_submit(struct host1x_job *job);
 
-- 
cgit v1.2.3


From c0cdc89072a3e1ae3981437f385de14b7bba8fd8 Mon Sep 17 00:00:00 2001
From: Valentin Schneider <valentin.schneider@arm.com>
Date: Wed, 27 Oct 2021 16:15:04 +0100
Subject: irqchip/gic-v3-its: Give the percpu rdist struct its own flags field

Later patches will require tracking some per-rdist status. Reuse the bytes
"lost" to padding within the __percpu rdist struct as a flags field, and
re-encode ->lpi_enabled within said flags.

No change in functionality intended.

Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20211027151506.2085066-2-valentin.schneider@arm.com
---
 include/linux/irqchip/arm-gic-v3.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index 81cbf85f73de..0dc34d7d735a 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -615,7 +615,7 @@ struct rdists {
 		void __iomem	*rd_base;
 		struct page	*pend_page;
 		phys_addr_t	phys_base;
-		bool		lpi_enabled;
+		u64             flags;
 		cpumask_t	*vpe_table_mask;
 		void		*vpe_l1_base;
 	} __percpu		*rdist;
-- 
cgit v1.2.3


From d23bc2bc1d634658d7fa96395419c1c553a784f0 Mon Sep 17 00:00:00 2001
From: Valentin Schneider <valentin.schneider@arm.com>
Date: Wed, 27 Oct 2021 16:15:05 +0100
Subject: irqchip/gic-v3-its: Postpone LPI pending table freeing and memreserve

Memory used by the LPI tables have to be made persistent for kexec to have
a chance to work, as explained in [1]. If they have been made persistent
and we are booting into a kexec'd kernel, we also need to free the pages
that were preemptively allocated by the new kernel for those tables.

Both of those operations currently happen during its_cpu_init(), which
happens in a _STARTING (IOW atomic) cpuhp callback for secondary
CPUs. efi_mem_reserve_iomem() issues a GFP_ATOMIC allocation, which
unfortunately doesn't work under PREEMPT_RT (this ends up grabbing a
non-raw spinlock, which can sleep under PREEMPT_RT). Similarly, freeing the
pages ends up grabbing a sleepable spinlock.

Since the memreserve is only required by kexec, it doesn't have to be done
so early in the secondary boot process. Issue the reservation in a new
CPUHP_AP_ONLINE_DYN cpuhp callback, and piggy-back the page freeing on top
of it. A CPU gets to run the body of this new callback exactly once.

As kexec issues a machine_shutdown() prior to machine_kexec(), it will be
serialized vs a CPU being plugged to life by the hotplug machinery - either
the CPU will have been brought up and have had its redistributor's pending
table memreserved, or it never went online and will have its table
allocated by the new kernel.

[1]: https://lore.kernel.org/lkml/20180921195954.21574-1-marc.zyngier@arm.com/

Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20211027151506.2085066-3-valentin.schneider@arm.com
---
 include/linux/irqchip/arm-gic-v3.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index 0dc34d7d735a..51b85506ae90 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -632,6 +632,7 @@ struct rdists {
 
 struct irq_domain;
 struct fwnode_handle;
+int __init its_lpi_memreserve_init(void);
 int its_cpu_init(void);
 int its_init(struct fwnode_handle *handle, struct rdists *rdists,
 	     struct irq_domain *domain);
-- 
cgit v1.2.3


From 835f442fdbce33a47a6bde356643fd7e3ef7ec1b Mon Sep 17 00:00:00 2001
From: Valentin Schneider <valentin.schneider@arm.com>
Date: Wed, 27 Oct 2021 16:15:06 +0100
Subject: irqchip/gic-v3-its: Limit memreserve cpuhp state lifetime

The new memreserve cpuhp callback only needs to survive up until a point
where every CPU in the system has booted once. Beyond that, it becomes a
no-op and can be put in the bin.

Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20211027151506.2085066-4-valentin.schneider@arm.com
---
 include/linux/irqchip/arm-gic-v3.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index 51b85506ae90..12d91f0dedf9 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -624,6 +624,7 @@ struct rdists {
 	u64			flags;
 	u32			gicd_typer;
 	u32			gicd_typer2;
+	int                     cpuhp_memreserve_state;
 	bool			has_vlpis;
 	bool			has_rvpeid;
 	bool			has_direct_lpi;
-- 
cgit v1.2.3


From 3c67d44de787dff288d7f2a51c372b22f7356db6 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 3 Dec 2021 06:48:53 -0700
Subject: block: add mq_ops->queue_rqs hook

If we have a list of requests in our plug list, send it to the driver in
one go, if possible. The driver must set mq_ops->queue_rqs() to support
this, if not the usual one-by-one path is used.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 772f8f921526..550996cf419c 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -492,6 +492,14 @@ struct blk_mq_ops {
 	 */
 	void (*commit_rqs)(struct blk_mq_hw_ctx *);
 
+	/**
+	 * @queue_rqs: Queue a list of new requests. Driver is guaranteed
+	 * that each request belongs to the same queue. If the driver doesn't
+	 * empty the @rqlist completely, then the rest will be queued
+	 * individually by the block layer upon return.
+	 */
+	void (*queue_rqs)(struct request **rqlist);
+
 	/**
 	 * @get_budget: Reserve budget before queue request, once .queue_rq is
 	 * run, it is driver's responsibility to release the
-- 
cgit v1.2.3


From 8a2ba1785c5803d59a63b6320ff54fd4a37a41ce Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 9 Dec 2021 07:31:21 +0100
Subject: block: remove the nr_task field from struct io_context

Nothing ever looks at ->nr_tasks, so remove it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20211209063131.18537-2-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/iocontext.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index c1229fbd6691..82c7f4f5f4f5 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -99,7 +99,6 @@ struct io_cq {
 struct io_context {
 	atomic_long_t refcount;
 	atomic_t active_ref;
-	atomic_t nr_tasks;
 
 	/* all the fields below are protected by this lock */
 	spinlock_t lock;
-- 
cgit v1.2.3


From a411cd3cfdc5bbd1329d5b33dbf39e2b5213969d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 9 Dec 2021 07:31:27 +0100
Subject: block: move set_task_ioprio to blk-ioc.c

Keep set_task_ioprio with the other low-level code that accesses the
io_context structure.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20211209063131.18537-8-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/iocontext.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index 82c7f4f5f4f5..648331f35fc6 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -116,8 +116,6 @@ struct task_struct;
 #ifdef CONFIG_BLOCK
 void put_io_context(struct io_context *ioc);
 void exit_io_context(struct task_struct *task);
-struct io_context *get_task_io_context(struct task_struct *task,
-				       gfp_t gfp_flags, int node);
 int __copy_io(unsigned long clone_flags, struct task_struct *tsk);
 static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk)
 {
-- 
cgit v1.2.3


From 5ef1630586317e92c9ebd7b4ce48f393b7ff790f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 9 Dec 2021 07:31:31 +0100
Subject: block: only build the icq tracking code when needed

Only bfq needs to code to track icq, so make it conditional.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20211209063131.18537-12-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/iocontext.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index 648331f35fc6..14f7eaf1b443 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -100,16 +100,18 @@ struct io_context {
 	atomic_long_t refcount;
 	atomic_t active_ref;
 
+	unsigned short ioprio;
+
+#ifdef CONFIG_BLK_ICQ
 	/* all the fields below are protected by this lock */
 	spinlock_t lock;
 
-	unsigned short ioprio;
-
 	struct radix_tree_root	icq_tree;
 	struct io_cq __rcu	*icq_hint;
 	struct hlist_head	icq_list;
 
 	struct work_struct release_work;
+#endif /* CONFIG_BLK_ICQ */
 };
 
 struct task_struct;
-- 
cgit v1.2.3


From 85f5a74c2b9ba213d4102dc12ccbfdbe26472abb Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 8 Apr 2021 01:33:45 -0400
Subject: block: Add bio_add_folio()

This is a thin wrapper around bio_add_page().  The main advantage here
is the documentation that folios larger than 2GiB are not supported.
It's not currently possible to allocate folios that large, but if it
ever becomes possible, this function will fail gracefully instead of
doing I/O to the wrong bytes.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
---
 include/linux/bio.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index fe6bdfbbef66..a783cac49978 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -409,7 +409,8 @@ extern void bio_uninit(struct bio *);
 extern void bio_reset(struct bio *);
 void bio_chain(struct bio *, struct bio *);
 
-extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int);
+int bio_add_page(struct bio *, struct page *, unsigned len, unsigned off);
+bool bio_add_folio(struct bio *, struct folio *, size_t len, size_t off);
 extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
 			   unsigned int, unsigned int);
 int bio_add_zone_append_page(struct bio *bio, struct page *page,
-- 
cgit v1.2.3


From 640d1930bef4f87ec8d8d2b05f0f6edc1dfcf662 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 4 Jan 2021 10:58:17 -0500
Subject: block: Add bio_for_each_folio_all()

Allow callers to iterate over each folio instead of each page.  The
bio need not have been constructed using folios originally.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
---
 include/linux/bio.h | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 52 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index a783cac49978..e3c9e8207f12 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -166,7 +166,7 @@ static inline void bio_advance(struct bio *bio, unsigned int nbytes)
  */
 #define bio_for_each_bvec_all(bvl, bio, i)		\
 	for (i = 0, bvl = bio_first_bvec_all(bio);	\
-	     i < (bio)->bi_vcnt; i++, bvl++)		\
+	     i < (bio)->bi_vcnt; i++, bvl++)
 
 #define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len)
 
@@ -260,6 +260,57 @@ static inline struct bio_vec *bio_last_bvec_all(struct bio *bio)
 	return &bio->bi_io_vec[bio->bi_vcnt - 1];
 }
 
+/**
+ * struct folio_iter - State for iterating all folios in a bio.
+ * @folio: The current folio we're iterating.  NULL after the last folio.
+ * @offset: The byte offset within the current folio.
+ * @length: The number of bytes in this iteration (will not cross folio
+ *	boundary).
+ */
+struct folio_iter {
+	struct folio *folio;
+	size_t offset;
+	size_t length;
+	/* private: for use by the iterator */
+	size_t _seg_count;
+	int _i;
+};
+
+static inline void bio_first_folio(struct folio_iter *fi, struct bio *bio,
+				   int i)
+{
+	struct bio_vec *bvec = bio_first_bvec_all(bio) + i;
+
+	fi->folio = page_folio(bvec->bv_page);
+	fi->offset = bvec->bv_offset +
+			PAGE_SIZE * (bvec->bv_page - &fi->folio->page);
+	fi->_seg_count = bvec->bv_len;
+	fi->length = min(folio_size(fi->folio) - fi->offset, fi->_seg_count);
+	fi->_i = i;
+}
+
+static inline void bio_next_folio(struct folio_iter *fi, struct bio *bio)
+{
+	fi->_seg_count -= fi->length;
+	if (fi->_seg_count) {
+		fi->folio = folio_next(fi->folio);
+		fi->offset = 0;
+		fi->length = min(folio_size(fi->folio), fi->_seg_count);
+	} else if (fi->_i + 1 < bio->bi_vcnt) {
+		bio_first_folio(fi, bio, fi->_i + 1);
+	} else {
+		fi->folio = NULL;
+	}
+}
+
+/**
+ * bio_for_each_folio_all - Iterate over each folio in a bio.
+ * @fi: struct folio_iter which is updated for each folio.
+ * @bio: struct bio to iterate over.
+ */
+#define bio_for_each_folio_all(fi, bio)				\
+	for (bio_first_folio(&fi, bio, 0); fi.folio; bio_next_folio(&fi, bio))
+
 enum bip_flags {
 	BIP_BLOCK_INTEGRITY	= 1 << 0, /* block layer owns integrity data */
 	BIP_MAPPED_INTEGRITY	= 1 << 1, /* ref tag has been remapped */
-- 
cgit v1.2.3


From 8306a5f56305521d8b307b4ee1f69949fbb49279 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 28 Apr 2021 07:51:36 -0400
Subject: iomap: Add iomap_invalidate_folio

Keep iomap_invalidatepage around as a wrapper for use in address_space
operations.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
---
 include/linux/iomap.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 6d1b08d0ae93..29491fb9c5ba 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -225,6 +225,7 @@ void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops);
 int iomap_is_partially_uptodate(struct page *page, unsigned long from,
 		unsigned long count);
 int iomap_releasepage(struct page *page, gfp_t gfp_mask);
+void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len);
 void iomap_invalidatepage(struct page *page, unsigned int offset,
 		unsigned int len);
 #ifdef CONFIG_MIGRATION
-- 
cgit v1.2.3


From ea6fa4961aab8f90a8aa03575a98b4bda368d4b6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20Jo=C5=84czyk?= <mat.jonczyk@o2.pl>
Date: Fri, 10 Dec 2021 21:01:26 +0100
Subject: rtc: mc146818-lib: fix RTC presence check
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To prevent an infinite loop in mc146818_get_time(),
commit 211e5db19d15 ("rtc: mc146818: Detect and handle broken RTCs")
added a check for RTC availability. Together with a later fix, it
checked if bit 6 in register 0x0d is cleared.

This, however, caused a false negative on a motherboard with an AMD
SB710 southbridge; according to the specification [1], bit 6 of register
0x0d of this chipset is a scratchbit. This caused a regression in Linux
5.11 - the RTC was determined broken by the kernel and not used by
rtc-cmos.c [3]. This problem was also reported in Fedora [4].

As a better alternative, check whether the UIP ("Update-in-progress")
bit is set for longer then 10ms. If that is the case, then apparently
the RTC is either absent (and all register reads return 0xff) or broken.
Also limit the number of loop iterations in mc146818_get_time() to 10 to
prevent an infinite loop there.

The functions mc146818_get_time() and mc146818_does_rtc_work() will be
refactored later in this patch series, in order to fix a separate
problem with reading / setting the RTC alarm time. This is done so to
avoid a confusion about what is being fixed when.

In a previous approach to this problem, I implemented a check whether
the RTC_HOURS register contains a value <= 24. This, however, sometimes
did not work correctly on my Intel Kaby Lake laptop. According to
Intel's documentation [2], "the time and date RAM locations (0-9) are
disconnected from the external bus" during the update cycle so reading
this register without checking the UIP bit is incorrect.

[1] AMD SB700/710/750 Register Reference Guide, page 308,
https://developer.amd.com/wordpress/media/2012/10/43009_sb7xx_rrg_pub_1.00.pdf

[2] 7th Generation Intel ® Processor Family I/O for U/Y Platforms [...] Datasheet
Volume 1 of 2, page 209
Intel's Document Number: 334658-006,
https://www.intel.com/content/dam/www/public/us/en/documents/datasheets/7th-and-8th-gen-core-family-mobile-u-y-processor-lines-i-o-datasheet-vol-1.pdf

[3] Functions in arch/x86/kernel/rtc.c apparently were using it.

[4] https://bugzilla.redhat.com/show_bug.cgi?id=1936688

Fixes: 211e5db19d15 ("rtc: mc146818: Detect and handle broken RTCs")
Fixes: ebb22a059436 ("rtc: mc146818: Dont test for bit 0-5 in Register D")
Signed-off-by: Mateusz Jończyk <mat.jonczyk@o2.pl>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Alessandro Zummo <a.zummo@towertech.it>
Cc: Alexandre Belloni <alexandre.belloni@bootlin.com>
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Link: https://lore.kernel.org/r/20211210200131.153887-5-mat.jonczyk@o2.pl
---
 include/linux/mc146818rtc.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mc146818rtc.h b/include/linux/mc146818rtc.h
index 0661af17a758..69c80c4325bf 100644
--- a/include/linux/mc146818rtc.h
+++ b/include/linux/mc146818rtc.h
@@ -123,6 +123,7 @@ struct cmos_rtc_board_info {
 #define RTC_IO_EXTENT_USED      RTC_IO_EXTENT
 #endif /* ARCH_RTC_LOCATION */
 
+bool mc146818_does_rtc_work(void);
 unsigned int mc146818_get_time(struct rtc_time *time);
 int mc146818_set_time(struct rtc_time *time);
 
-- 
cgit v1.2.3


From ec5895c0f2d87b9bf4185db1915e40fa6fcfc0ac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20Jo=C5=84czyk?= <mat.jonczyk@o2.pl>
Date: Fri, 10 Dec 2021 21:01:27 +0100
Subject: rtc: mc146818-lib: extract mc146818_avoid_UIP
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Function mc146818_get_time() contains an elaborate mechanism of reading
the RTC time while no RTC update is in progress. It turns out that
reading the RTC alarm clock also requires avoiding the RTC update.
Therefore, the mechanism in mc146818_get_time() should be reused - so
extract it into a separate function.

The logic in mc146818_avoid_UIP() is same as in mc146818_get_time()
except that after every

        if (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP) {

there is now "mdelay(1)".

To avoid producing a very unreadable patch, mc146818_get_time() will be
refactored to use mc146818_avoid_UIP() in the next patch.

Signed-off-by: Mateusz Jończyk <mat.jonczyk@o2.pl>
Cc: Alessandro Zummo <a.zummo@towertech.it>
Cc: Alexandre Belloni <alexandre.belloni@bootlin.com>
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Link: https://lore.kernel.org/r/20211210200131.153887-6-mat.jonczyk@o2.pl
---
 include/linux/mc146818rtc.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mc146818rtc.h b/include/linux/mc146818rtc.h
index 69c80c4325bf..67fb0a12becc 100644
--- a/include/linux/mc146818rtc.h
+++ b/include/linux/mc146818rtc.h
@@ -127,4 +127,7 @@ bool mc146818_does_rtc_work(void);
 unsigned int mc146818_get_time(struct rtc_time *time);
 int mc146818_set_time(struct rtc_time *time);
 
+bool mc146818_avoid_UIP(void (*callback)(unsigned char seconds, void *param),
+			void *param);
+
 #endif /* _MC146818RTC_H */
-- 
cgit v1.2.3


From 34fff62827b254f8a43633cc878deb04bf11297c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 10 Dec 2021 23:18:54 +0100
Subject: device: Move MSI related data into a struct

The only unconditional part of MSI data in struct device is the irqdomain
pointer. Everything else can be allocated on demand. Create a data
structure and move the irqdomain pointer into it. The other MSI specific
parts are going to be removed from struct device in later steps.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Michael Kelley <mikelley@microsoft.com>
Tested-by: Nishanth Menon <nm@ti.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/r/20211210221813.617178827@linutronix.de
---
 include/linux/device.h | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index 2a22875238a6..f212b7a7b156 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -371,6 +371,16 @@ struct dev_links_info {
 	enum dl_dev_state status;
 };
 
+/**
+ * struct dev_msi_info - Device data related to MSI
+ * @domain:	The MSI interrupt domain associated to the device
+ */
+struct dev_msi_info {
+#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
+	struct irq_domain	*domain;
+#endif
+};
+
 /**
  * struct device - The basic device structure
  * @parent:	The device's "parent" device, the device to which it is attached.
@@ -407,8 +417,8 @@ struct dev_links_info {
  * @em_pd:	device's energy model performance domain
  * @pins:	For device pin management.
  *		See Documentation/driver-api/pin-control.rst for details.
+ * @msi:	MSI related data
  * @msi_list:	Hosts MSI descriptors
- * @msi_domain: The generic MSI domain this device is using.
  * @numa_node:	NUMA node this device is close to.
  * @dma_ops:    DMA mapping operations for this device.
  * @dma_mask:	Dma mask (if dma'ble device).
@@ -500,12 +510,10 @@ struct device {
 	struct em_perf_domain	*em_pd;
 #endif
 
-#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
-	struct irq_domain	*msi_domain;
-#endif
 #ifdef CONFIG_PINCTRL
 	struct dev_pin_info	*pins;
 #endif
+	struct dev_msi_info	msi;
 #ifdef CONFIG_GENERIC_MSI_IRQ
 	struct list_head	msi_list;
 #endif
@@ -666,7 +674,7 @@ static inline void set_dev_node(struct device *dev, int node)
 static inline struct irq_domain *dev_get_msi_domain(const struct device *dev)
 {
 #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
-	return dev->msi_domain;
+	return dev->msi.domain;
 #else
 	return NULL;
 #endif
@@ -675,7 +683,7 @@ static inline struct irq_domain *dev_get_msi_domain(const struct device *dev)
 static inline void dev_set_msi_domain(struct device *dev, struct irq_domain *d)
 {
 #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
-	dev->msi_domain = d;
+	dev->msi.domain = d;
 #endif
 }
 
-- 
cgit v1.2.3


From 013bd8e543c2c777b586cf033c588ea82bd502db Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 10 Dec 2021 23:18:55 +0100
Subject: device: Add device:: Msi_data pointer and struct msi_device_data

Create struct msi_device_data and add a pointer of that type to struct
dev_msi_info, which is part of struct device. Provide an allocator function
which can be invoked from the MSI interrupt allocation code pathes.

Add a properties field to the data structure as a first member so the
allocation size is not zero bytes. The field will be uses later on.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Michael Kelley <mikelley@microsoft.com>
Tested-by: Nishanth Menon <nm@ti.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20211210221813.676660809@linutronix.de
---
 include/linux/device.h |  5 +++++
 include/linux/msi.h    | 18 ++++++++++++++++++
 2 files changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index f212b7a7b156..f0033cd93631 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -45,6 +45,7 @@ struct iommu_ops;
 struct iommu_group;
 struct dev_pin_info;
 struct dev_iommu;
+struct msi_device_data;
 
 /**
  * struct subsys_interface - interfaces to device functions
@@ -374,11 +375,15 @@ struct dev_links_info {
 /**
  * struct dev_msi_info - Device data related to MSI
  * @domain:	The MSI interrupt domain associated to the device
+ * @data:	Pointer to MSI device data
  */
 struct dev_msi_info {
 #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
 	struct irq_domain	*domain;
 #endif
+#ifdef CONFIG_GENERIC_MSI_IRQ
+	struct msi_device_data	*data;
+#endif
 };
 
 /**
diff --git a/include/linux/msi.h b/include/linux/msi.h
index ba4a39c430b5..7e4c8fd7c65d 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -171,6 +171,16 @@ struct msi_desc {
 	};
 };
 
+/**
+ * msi_device_data - MSI per device data
+ * @properties:		MSI properties which are interesting to drivers
+ */
+struct msi_device_data {
+	unsigned long			properties;
+};
+
+int msi_setup_device_data(struct device *dev);
+
 /* Helpers to hide struct msi_desc implementation details */
 #define msi_desc_to_dev(desc)		((desc)->dev)
 #define dev_to_msi_list(dev)		(&(dev)->msi_list)
@@ -233,10 +243,16 @@ void pci_msi_mask_irq(struct irq_data *data);
 void pci_msi_unmask_irq(struct irq_data *data);
 
 #ifdef CONFIG_SYSFS
+int msi_device_populate_sysfs(struct device *dev);
+void msi_device_destroy_sysfs(struct device *dev);
+
 const struct attribute_group **msi_populate_sysfs(struct device *dev);
 void msi_destroy_sysfs(struct device *dev,
 		       const struct attribute_group **msi_irq_groups);
 #else
+static inline int msi_device_populate_sysfs(struct device *dev) { return 0; }
+static inline void msi_device_destroy_sysfs(struct device *dev) { }
+
 static inline const struct attribute_group **msi_populate_sysfs(struct device *dev)
 {
 	return NULL;
@@ -384,6 +400,8 @@ enum {
 	MSI_FLAG_MUST_REACTIVATE	= (1 << 5),
 	/* Is level-triggered capable, using two messages */
 	MSI_FLAG_LEVEL_CAPABLE		= (1 << 6),
+	/* Populate sysfs on alloc() and destroy it on free() */
+	MSI_FLAG_DEV_SYSFS		= (1 << 7),
 };
 
 int msi_domain_set_affinity(struct irq_data *data, const struct cpumask *mask,
-- 
cgit v1.2.3


From 3f35d2cf9fbc656db82579d849cc69c373b1ad0d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 15 Dec 2021 18:16:44 +0100
Subject: PCI/MSI: Decouple MSI[-X] disable from pcim_release()

The MSI core will introduce runtime allocation of MSI related data. This
data will be devres managed and has to be set up before enabling
PCI/MSI[-X]. This would introduce an ordering issue vs. pcim_release().

The setup order is:

   pcim_enable_device()
	devres_alloc(pcim_release...);
	...
	pci_irq_alloc()
	  msi_setup_device_data()
	     devres_alloc(msi_device_data_release, ...)

and once the device is released these release functions are invoked in the
opposite order:

    msi_device_data_release()
    ...
    pcim_release()
       pci_disable_msi[x]()

which is obviously wrong, because pci_disable_msi[x]() requires the MSI
data to be available to tear down the MSI[-X] interrupts.

Remove the MSI[-X] teardown from pcim_release() and add an explicit action
to be installed on the attempt of enabling PCI/MSI[-X].

This allows the MSI core data allocation to be ordered correctly in a
subsequent step.

Reported-by: Nishanth Menon <nm@ti.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Michael Kelley <mikelley@microsoft.com>
Tested-by: Nishanth Menon <nm@ti.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/87tuf9rdoj.ffs@tglx
---
 include/linux/pci.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 5cc46baef519..a09736d3e05e 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -425,7 +425,8 @@ struct pci_dev {
 	unsigned int	ats_enabled:1;		/* Address Translation Svc */
 	unsigned int	pasid_enabled:1;	/* Process Address Space ID */
 	unsigned int	pri_enabled:1;		/* Page Request Interface */
-	unsigned int	is_managed:1;
+	unsigned int	is_managed:1;		/* Managed via devres */
+	unsigned int	is_msi_managed:1;	/* MSI release via devres installed */
 	unsigned int	needs_freset:1;		/* Requires fundamental reset */
 	unsigned int	state_saved:1;
 	unsigned int	is_physfn:1;
-- 
cgit v1.2.3


From bf6e054e0e3fbc9614355b760e18c8a14f952a4e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 10 Dec 2021 23:19:03 +0100
Subject: genirq/msi: Provide msi_device_populate/destroy_sysfs()

Add new allocation functions which can be activated by domain info
flags. They store the groups pointer in struct msi_device_data.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Michael Kelley <mikelley@microsoft.com>
Tested-by: Nishanth Menon <nm@ti.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20211210221813.988659194@linutronix.de
---
 include/linux/msi.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index 7e4c8fd7c65d..1b96dc483b88 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -56,6 +56,8 @@ struct irq_data;
 struct msi_desc;
 struct pci_dev;
 struct platform_msi_priv_data;
+struct attribute_group;
+
 void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
 #ifdef CONFIG_GENERIC_MSI_IRQ
 void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg);
@@ -174,9 +176,11 @@ struct msi_desc {
 /**
  * msi_device_data - MSI per device data
  * @properties:		MSI properties which are interesting to drivers
+ * @attrs:		Pointer to the sysfs attribute group
  */
 struct msi_device_data {
 	unsigned long			properties;
+	const struct attribute_group    **attrs;
 };
 
 int msi_setup_device_data(struct device *dev);
-- 
cgit v1.2.3


From ffd84485e6beb9cad3e5a133d88201b995298c33 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 10 Dec 2021 23:19:05 +0100
Subject: PCI/MSI: Let the irq code handle sysfs groups

Set the domain info flag which makes the core code handle sysfs groups and
put an explicit invocation into the legacy code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Michael Kelley <mikelley@microsoft.com>
Tested-by: Nishanth Menon <nm@ti.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://lore.kernel.org/r/20211210221814.048612053@linutronix.de
---
 include/linux/pci.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index a09736d3e05e..0a7b6b2f163b 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -476,7 +476,6 @@ struct pci_dev {
 #ifdef CONFIG_PCI_MSI
 	void __iomem	*msix_base;
 	raw_spinlock_t	msi_lock;
-	const struct attribute_group **msi_irq_groups;
 #endif
 	struct pci_vpd	vpd;
 #ifdef CONFIG_PCIE_DPC
-- 
cgit v1.2.3


From 24cff375fdb663c2238f06693a067b9219596fdc Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 10 Dec 2021 23:19:08 +0100
Subject: genirq/msi: Remove the original sysfs interfaces

No more users. Refactor the core code accordingly and move the global
interface under CONFIG_PCI_MSI_ARCH_FALLBACKS.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Michael Kelley <mikelley@microsoft.com>
Tested-by: Nishanth Menon <nm@ti.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20211210221814.168362229@linutronix.de
---
 include/linux/msi.h | 18 +++---------------
 1 file changed, 3 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index 1b96dc483b88..634a12962e72 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -249,22 +249,10 @@ void pci_msi_unmask_irq(struct irq_data *data);
 #ifdef CONFIG_SYSFS
 int msi_device_populate_sysfs(struct device *dev);
 void msi_device_destroy_sysfs(struct device *dev);
-
-const struct attribute_group **msi_populate_sysfs(struct device *dev);
-void msi_destroy_sysfs(struct device *dev,
-		       const struct attribute_group **msi_irq_groups);
-#else
+#else /* CONFIG_SYSFS */
 static inline int msi_device_populate_sysfs(struct device *dev) { return 0; }
 static inline void msi_device_destroy_sysfs(struct device *dev) { }
-
-static inline const struct attribute_group **msi_populate_sysfs(struct device *dev)
-{
-	return NULL;
-}
-static inline void msi_destroy_sysfs(struct device *dev, const struct attribute_group **msi_irq_groups)
-{
-}
-#endif
+#endif /* !CONFIG_SYSFS */
 
 /*
  * The arch hooks to setup up msi irqs. Default functions are implemented
@@ -279,7 +267,7 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc);
 void arch_teardown_msi_irq(unsigned int irq);
 int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
 void arch_teardown_msi_irqs(struct pci_dev *dev);
-#endif
+#endif /* CONFIG_PCI_MSI_ARCH_FALLBACKS */
 
 /*
  * The restore hook is still available even for fully irq domain based
-- 
cgit v1.2.3


From 9835cec6d557b0bff3d48bd91cd0484aba59386c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 10 Dec 2021 23:19:09 +0100
Subject: platform-msi: Rename functions and clarify comments

It's hard to distinguish what platform_msi_domain_alloc() and
platform_msi_domain_alloc_irqs() are about. Make the distinction more
explicit and add comments which explain the use cases properly.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Nishanth Menon <nm@ti.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20211210221814.228706214@linutronix.de
---
 include/linux/msi.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index 634a12962e72..12dd28629c43 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -435,10 +435,10 @@ __platform_msi_create_device_domain(struct device *dev,
 #define platform_msi_create_device_tree_domain(dev, nvec, write, ops, data) \
 	__platform_msi_create_device_domain(dev, nvec, true, write, ops, data)
 
-int platform_msi_domain_alloc(struct irq_domain *domain, unsigned int virq,
-			      unsigned int nr_irqs);
-void platform_msi_domain_free(struct irq_domain *domain, unsigned int virq,
-			      unsigned int nvec);
+int platform_msi_device_domain_alloc(struct irq_domain *domain, unsigned int virq,
+				     unsigned int nr_irqs);
+void platform_msi_device_domain_free(struct irq_domain *domain, unsigned int virq,
+				     unsigned int nvec);
 void *platform_msi_get_host_data(struct irq_domain *domain);
 #endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */
 
-- 
cgit v1.2.3


From fc22e7dbcdb3e06a3d3ce05fc91c6a2345411f9b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 10 Dec 2021 23:19:11 +0100
Subject: platform-msi: Store platform private data pointer in msi_device_data

Storing the platform private data in a MSI descriptor is sloppy at
best. The data belongs to the device and not to the descriptor.
Add a pointer to struct msi_device_data and store the pointer there.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Nishanth Menon <nm@ti.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20211210221814.287680528@linutronix.de
---
 include/linux/msi.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index 12dd28629c43..cdf0d09c3ad4 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -108,11 +108,9 @@ struct pci_msi_desc {
 
 /**
  * platform_msi_desc - Platform device specific msi descriptor data
- * @msi_priv_data:	Pointer to platform private data
  * @msi_index:		The index of the MSI descriptor for multi MSI
  */
 struct platform_msi_desc {
-	struct platform_msi_priv_data	*msi_priv_data;
 	u16				msi_index;
 };
 
@@ -177,10 +175,12 @@ struct msi_desc {
  * msi_device_data - MSI per device data
  * @properties:		MSI properties which are interesting to drivers
  * @attrs:		Pointer to the sysfs attribute group
+ * @platform_data:	Platform-MSI specific data
  */
 struct msi_device_data {
 	unsigned long			properties;
 	const struct attribute_group    **attrs;
+	struct platform_msi_priv_data	*platform_data;
 };
 
 int msi_setup_device_data(struct device *dev);
-- 
cgit v1.2.3


From 20c6d424cfe641659c9a025db8a8608701b27246 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 10 Dec 2021 23:19:12 +0100
Subject: genirq/msi: Consolidate MSI descriptor data

All non PCI/MSI usage variants have data structures in struct msi_desc with
only one member: xxx_index. PCI/MSI has a entry_nr member.

Add a common msi_index member to struct msi_desc so all implementations can
share it which allows further consolidation.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Michael Kelley <mikelley@microsoft.com>
Tested-by: Nishanth Menon <nm@ti.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20211210221814.350967317@linutronix.de
---
 include/linux/msi.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index cdf0d09c3ad4..ee8fe49dedd1 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -143,6 +143,7 @@ struct ti_sci_inta_msi_desc {
  *			address or data changes
  * @write_msi_msg_data:	Data parameter for the callback.
  *
+ * @msi_index:	Index of the msi descriptor
  * @pci:	[PCI]	    PCI speficic msi descriptor data
  * @platform:	[platform]  Platform device specific msi descriptor data
  * @fsl_mc:	[fsl-mc]    FSL MC device specific msi descriptor data
@@ -163,6 +164,7 @@ struct msi_desc {
 	void (*write_msi_msg)(struct msi_desc *entry, void *data);
 	void *write_msi_msg_data;
 
+	u16				msi_index;
 	union {
 		struct pci_msi_desc		pci;
 		struct platform_msi_desc	platform;
-- 
cgit v1.2.3


From dba27c7fa36f468e7eb29b216879f8c33bf0955d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 10 Dec 2021 23:19:14 +0100
Subject: platform-msi: Use msi_desc::msi_index

Use the common msi_index member and get rid of the pointless wrapper struct.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Nishanth Menon <nm@ti.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20211210221814.413638645@linutronix.de
---
 include/linux/msi.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index ee8fe49dedd1..1d85e954e130 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -106,14 +106,6 @@ struct pci_msi_desc {
 	};
 };
 
-/**
- * platform_msi_desc - Platform device specific msi descriptor data
- * @msi_index:		The index of the MSI descriptor for multi MSI
- */
-struct platform_msi_desc {
-	u16				msi_index;
-};
-
 /**
  * fsl_mc_msi_desc - FSL-MC device specific msi descriptor data
  * @msi_index:		The index of the MSI descriptor
@@ -145,7 +137,6 @@ struct ti_sci_inta_msi_desc {
  *
  * @msi_index:	Index of the msi descriptor
  * @pci:	[PCI]	    PCI speficic msi descriptor data
- * @platform:	[platform]  Platform device specific msi descriptor data
  * @fsl_mc:	[fsl-mc]    FSL MC device specific msi descriptor data
  * @inta:	[INTA]	    TISCI based INTA specific msi descriptor data
  */
@@ -167,7 +158,6 @@ struct msi_desc {
 	u16				msi_index;
 	union {
 		struct pci_msi_desc		pci;
-		struct platform_msi_desc	platform;
 		struct fsl_mc_msi_desc		fsl_mc;
 		struct ti_sci_inta_msi_desc	inta;
 	};
-- 
cgit v1.2.3


From 78ee9fb4b8b126ed84a819a6e1732fd3039b525a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 10 Dec 2021 23:19:15 +0100
Subject: bus: fsl-mc-msi: Use msi_desc::msi_index

Use the common msi_index member and get rid of the pointless wrapper struct.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20211210221814.477386185@linutronix.de
---
 include/linux/msi.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index 1d85e954e130..25edf83ede41 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -106,14 +106,6 @@ struct pci_msi_desc {
 	};
 };
 
-/**
- * fsl_mc_msi_desc - FSL-MC device specific msi descriptor data
- * @msi_index:		The index of the MSI descriptor
- */
-struct fsl_mc_msi_desc {
-	u16				msi_index;
-};
-
 /**
  * ti_sci_inta_msi_desc - TISCI based INTA specific msi descriptor data
  * @dev_index: TISCI device index
@@ -137,7 +129,6 @@ struct ti_sci_inta_msi_desc {
  *
  * @msi_index:	Index of the msi descriptor
  * @pci:	[PCI]	    PCI speficic msi descriptor data
- * @fsl_mc:	[fsl-mc]    FSL MC device specific msi descriptor data
  * @inta:	[INTA]	    TISCI based INTA specific msi descriptor data
  */
 struct msi_desc {
@@ -158,7 +149,6 @@ struct msi_desc {
 	u16				msi_index;
 	union {
 		struct pci_msi_desc		pci;
-		struct fsl_mc_msi_desc		fsl_mc;
 		struct ti_sci_inta_msi_desc	inta;
 	};
 };
-- 
cgit v1.2.3


From 0f18095871fc59c89a281caf6f18538cf9e50fbf Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 10 Dec 2021 23:19:17 +0100
Subject: soc: ti: ti_sci_inta_msi: Use msi_desc::msi_index

Use the common msi_index member and get rid of the pointless wrapper struct.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Nishanth Menon <nm@ti.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Acked-by: Nishanth Menon <nm@ti.com>
Link: https://lore.kernel.org/r/20211210221814.540704224@linutronix.de
---
 include/linux/msi.h | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index 25edf83ede41..45ec5d07a5f3 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -106,14 +106,6 @@ struct pci_msi_desc {
 	};
 };
 
-/**
- * ti_sci_inta_msi_desc - TISCI based INTA specific msi descriptor data
- * @dev_index: TISCI device index
- */
-struct ti_sci_inta_msi_desc {
-	u16	dev_index;
-};
-
 /**
  * struct msi_desc - Descriptor structure for MSI based interrupts
  * @list:	List head for management
@@ -128,8 +120,7 @@ struct ti_sci_inta_msi_desc {
  * @write_msi_msg_data:	Data parameter for the callback.
  *
  * @msi_index:	Index of the msi descriptor
- * @pci:	[PCI]	    PCI speficic msi descriptor data
- * @inta:	[INTA]	    TISCI based INTA specific msi descriptor data
+ * @pci:	PCI specific msi descriptor data
  */
 struct msi_desc {
 	/* Shared device/bus type independent data */
@@ -147,10 +138,7 @@ struct msi_desc {
 	void *write_msi_msg_data;
 
 	u16				msi_index;
-	union {
-		struct pci_msi_desc		pci;
-		struct ti_sci_inta_msi_desc	inta;
-	};
+	struct pci_msi_desc		pci;
 };
 
 /**
-- 
cgit v1.2.3


From 173ffad79d177d9a91fbf3be6bf67ca81e0f765a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 10 Dec 2021 23:19:18 +0100
Subject: PCI/MSI: Use msi_desc::msi_index

The usage of msi_desc::pci::entry_nr is confusing at best. It's the index
into the MSI[X] descriptor table.

Use msi_desc::msi_index which is shared between all MSI incarnations
instead of having a PCI specific storage for no value.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Michael Kelley <mikelley@microsoft.com>
Tested-by: Nishanth Menon <nm@ti.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://lore.kernel.org/r/20211210221814.602911509@linutronix.de
---
 include/linux/msi.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index 45ec5d07a5f3..b3d3b0bf59fe 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -80,7 +80,6 @@ typedef void (*irq_write_msi_msg_t)(struct msi_desc *desc,
  * @multi_cap:	[PCI MSI/X] log2 num of messages supported
  * @can_mask:	[PCI MSI/X] Masking supported?
  * @is_64:	[PCI MSI/X] Address size: 0=32bit 1=64bit
- * @entry_nr:	[PCI MSI/X] Entry which is described by this descriptor
  * @default_irq:[PCI MSI/X] The default pre-assigned non-MSI irq
  * @mask_pos:	[PCI MSI]   Mask register position
  * @mask_base:	[PCI MSI-X] Mask register base address
@@ -97,7 +96,6 @@ struct pci_msi_desc {
 		u8	can_mask	: 1;
 		u8	is_64		: 1;
 		u8	is_virtual	: 1;
-		u16	entry_nr;
 		unsigned default_irq;
 	} msi_attrib;
 	union {
-- 
cgit v1.2.3


From 7a823443e9b4ed1ff4a3026d184f09d23fd6d9c9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 10 Dec 2021 23:19:20 +0100
Subject: PCI/MSI: Provide MSI_FLAG_MSIX_CONTIGUOUS

Provide a domain info flag which makes the core code check for a contiguous
MSI-X index on allocation. That's simpler than checking it at some other
domain callback in architecture code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://lore.kernel.org/r/20211210221814.662401116@linutronix.de
---
 include/linux/msi.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index b3d3b0bf59fe..d206239e6fa8 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -362,6 +362,8 @@ enum {
 	MSI_FLAG_LEVEL_CAPABLE		= (1 << 6),
 	/* Populate sysfs on alloc() and destroy it on free() */
 	MSI_FLAG_DEV_SYSFS		= (1 << 7),
+	/* MSI-X entries must be contiguous */
+	MSI_FLAG_MSIX_CONTIGUOUS	= (1 << 8),
 };
 
 int msi_domain_set_affinity(struct irq_data *data, const struct cpumask *mask,
-- 
cgit v1.2.3


From cf15f43acaad31dabb2646cef170a506a1d663eb Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 10 Dec 2021 23:19:23 +0100
Subject: genirq/msi: Provide interface to retrieve Linux interrupt number

This allows drivers to retrieve the Linux interrupt number instead of
fiddling with MSI descriptors.

msi_get_virq() returns the Linux interrupt number or 0 in case that there
is no entry for the given MSI index.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Michael Kelley <mikelley@microsoft.com>
Tested-by: Nishanth Menon <nm@ti.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20211210221814.780824745@linutronix.de
---
 include/linux/msi.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index d206239e6fa8..7593fc383dba 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -153,6 +153,8 @@ struct msi_device_data {
 
 int msi_setup_device_data(struct device *dev);
 
+unsigned int msi_get_virq(struct device *dev, unsigned int index);
+
 /* Helpers to hide struct msi_desc implementation details */
 #define msi_desc_to_dev(desc)		((desc)->dev)
 #define dev_to_msi_list(dev)		(&(dev)->msi_list)
-- 
cgit v1.2.3


From d86a6d47bcc6b41fe2a4e13313d66a772d00382f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 10 Dec 2021 23:19:34 +0100
Subject: bus: fsl-mc: fsl-mc-allocator: Rework MSI handling

Storing a pointer to the MSI descriptor just to track the Linux interrupt
number is daft. Just store the interrupt number and be done with it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20211210221815.207838579@linutronix.de
---
 include/linux/fsl/mc.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fsl/mc.h b/include/linux/fsl/mc.h
index e026f6c48b49..7b6c42bfb660 100644
--- a/include/linux/fsl/mc.h
+++ b/include/linux/fsl/mc.h
@@ -91,13 +91,13 @@ struct fsl_mc_resource {
 
 /**
  * struct fsl_mc_device_irq - MC object device message-based interrupt
- * @msi_desc: pointer to MSI descriptor allocated by fsl_mc_msi_alloc_descs()
+ * @virq: Linux virtual interrupt number
  * @mc_dev: MC object device that owns this interrupt
  * @dev_irq_index: device-relative IRQ index
  * @resource: MC generic resource associated with the interrupt
  */
 struct fsl_mc_device_irq {
-	struct msi_desc *msi_desc;
+	unsigned int virq;
 	struct fsl_mc_device *mc_dev;
 	u8 dev_irq_index;
 	struct fsl_mc_resource resource;
-- 
cgit v1.2.3


From 89e0032ec201f76c86d6e3e6f94574dfb8e39b71 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 10 Dec 2021 23:19:35 +0100
Subject: soc: ti: ti_sci_inta_msi: Get rid of ti_sci_inta_msi_get_virq()

Just use the core function msi_get_virq().

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Nishanth Menon <nm@ti.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Vinod Koul <vkoul@kernel.org>
Acked-by: Nishanth Menon <nm@ti.com>
Link: https://lore.kernel.org/r/20211210221815.269468319@linutronix.de
---
 include/linux/soc/ti/ti_sci_inta_msi.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/soc/ti/ti_sci_inta_msi.h b/include/linux/soc/ti/ti_sci_inta_msi.h
index e3aa8b14612e..25ea78a8ea5c 100644
--- a/include/linux/soc/ti/ti_sci_inta_msi.h
+++ b/include/linux/soc/ti/ti_sci_inta_msi.h
@@ -18,6 +18,5 @@ struct irq_domain
 				   struct irq_domain *parent);
 int ti_sci_inta_msi_domain_alloc_irqs(struct device *dev,
 				      struct ti_sci_resource *res);
-unsigned int ti_sci_inta_msi_get_virq(struct device *dev, u32 index);
 void ti_sci_inta_msi_domain_free_irqs(struct device *dev);
 #endif /* __INCLUDE_LINUX_IRQCHIP_TI_SCI_INTA_H */
-- 
cgit v1.2.3


From 125282cd4f33ecd53a24ae4807409da0e5e90fd4 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 6 Dec 2021 23:51:04 +0100
Subject: genirq/msi: Move descriptor list to struct msi_device_data

It's only required when MSI is in use.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Michael Kelley <mikelley@microsoft.com>
Tested-by: Nishanth Menon <nm@ti.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20211206210747.650487479@linutronix.de
---
 include/linux/device.h | 4 ----
 include/linux/msi.h    | 4 +++-
 2 files changed, 3 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index f0033cd93631..93459724dcde 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -423,7 +423,6 @@ struct dev_msi_info {
  * @pins:	For device pin management.
  *		See Documentation/driver-api/pin-control.rst for details.
  * @msi:	MSI related data
- * @msi_list:	Hosts MSI descriptors
  * @numa_node:	NUMA node this device is close to.
  * @dma_ops:    DMA mapping operations for this device.
  * @dma_mask:	Dma mask (if dma'ble device).
@@ -519,9 +518,6 @@ struct device {
 	struct dev_pin_info	*pins;
 #endif
 	struct dev_msi_info	msi;
-#ifdef CONFIG_GENERIC_MSI_IRQ
-	struct list_head	msi_list;
-#endif
 #ifdef CONFIG_DMA_OPS
 	const struct dma_map_ops *dma_ops;
 #endif
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 7593fc383dba..4223e47103ed 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -144,11 +144,13 @@ struct msi_desc {
  * @properties:		MSI properties which are interesting to drivers
  * @attrs:		Pointer to the sysfs attribute group
  * @platform_data:	Platform-MSI specific data
+ * @list:		List of MSI descriptors associated to the device
  */
 struct msi_device_data {
 	unsigned long			properties;
 	const struct attribute_group    **attrs;
 	struct platform_msi_priv_data	*platform_data;
+	struct list_head		list;
 };
 
 int msi_setup_device_data(struct device *dev);
@@ -157,7 +159,7 @@ unsigned int msi_get_virq(struct device *dev, unsigned int index);
 
 /* Helpers to hide struct msi_desc implementation details */
 #define msi_desc_to_dev(desc)		((desc)->dev)
-#define dev_to_msi_list(dev)		(&(dev)->msi_list)
+#define dev_to_msi_list(dev)		(&(dev)->msi.data->list)
 #define first_msi_entry(dev)		\
 	list_first_entry(dev_to_msi_list((dev)), struct msi_desc, list)
 #define for_each_msi_entry(desc, dev)	\
-- 
cgit v1.2.3


From b5f687f97d1e112493fe0447a1fb09fbd93c334b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 6 Dec 2021 23:51:05 +0100
Subject: genirq/msi: Add mutex for MSI list protection

For upcoming runtime extensions of MSI-X interrupts it's required to
protect the MSI descriptor list. Add a mutex to struct msi_device_data and
provide lock/unlock functions.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Michael Kelley <mikelley@microsoft.com>
Tested-by: Nishanth Menon <nm@ti.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20211206210747.708877269@linutronix.de
---
 include/linux/msi.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index 4223e47103ed..2cf6c530588d 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -3,6 +3,7 @@
 #define LINUX_MSI_H
 
 #include <linux/cpumask.h>
+#include <linux/mutex.h>
 #include <linux/list.h>
 #include <asm/msi.h>
 
@@ -145,17 +146,21 @@ struct msi_desc {
  * @attrs:		Pointer to the sysfs attribute group
  * @platform_data:	Platform-MSI specific data
  * @list:		List of MSI descriptors associated to the device
+ * @mutex:		Mutex protecting the MSI list
  */
 struct msi_device_data {
 	unsigned long			properties;
 	const struct attribute_group    **attrs;
 	struct platform_msi_priv_data	*platform_data;
 	struct list_head		list;
+	struct mutex			mutex;
 };
 
 int msi_setup_device_data(struct device *dev);
 
 unsigned int msi_get_virq(struct device *dev, unsigned int index);
+void msi_lock_descs(struct device *dev);
+void msi_unlock_descs(struct device *dev);
 
 /* Helpers to hide struct msi_desc implementation details */
 #define msi_desc_to_dev(desc)		((desc)->dev)
-- 
cgit v1.2.3


From 0f62d941acf9ac3b6025692ce649b1f282b89e7f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 6 Dec 2021 23:51:07 +0100
Subject: genirq/msi: Provide msi_domain_alloc/free_irqs_descs_locked()

Usage sites which do allocations of the MSI descriptors before invoking
msi_domain_alloc_irqs() require to lock the MSI decriptors accross the
operation.

Provide entry points which can be called with the MSI mutex held and lock
the mutex in the existing entry points.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Michael Kelley <mikelley@microsoft.com>
Tested-by: Nishanth Menon <nm@ti.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20211206210747.765371053@linutronix.de
---
 include/linux/msi.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index 2cf6c530588d..69c588efe85b 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -383,9 +383,12 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode,
 					 struct irq_domain *parent);
 int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
 			    int nvec);
+int msi_domain_alloc_irqs_descs_locked(struct irq_domain *domain, struct device *dev,
+				       int nvec);
 int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
 			  int nvec);
 void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev);
+void msi_domain_free_irqs_descs_locked(struct irq_domain *domain, struct device *dev);
 void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev);
 struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain);
 
-- 
cgit v1.2.3


From 1046f71d7268b1680d7b044dea83c664403f6302 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 6 Dec 2021 23:51:08 +0100
Subject: genirq/msi: Provide a set of advanced MSI accessors and iterators

In preparation for dynamic handling of MSI-X interrupts provide a new set
of MSI descriptor accessor functions and iterators. They are benefitial per
se as they allow to cleanup quite some code in various MSI domain
implementations.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Michael Kelley <mikelley@microsoft.com>
Tested-by: Nishanth Menon <nm@ti.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20211206210747.818635078@linutronix.de
---
 include/linux/msi.h | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index 69c588efe85b..703221f7e9ea 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -140,6 +140,18 @@ struct msi_desc {
 	struct pci_msi_desc		pci;
 };
 
+/*
+ * Filter values for the MSI descriptor iterators and accessor functions.
+ */
+enum msi_desc_filter {
+	/* All descriptors */
+	MSI_DESC_ALL,
+	/* Descriptors which have no interrupt associated */
+	MSI_DESC_NOTASSOCIATED,
+	/* Descriptors which have an interrupt associated */
+	MSI_DESC_ASSOCIATED,
+};
+
 /**
  * msi_device_data - MSI per device data
  * @properties:		MSI properties which are interesting to drivers
@@ -147,6 +159,7 @@ struct msi_desc {
  * @platform_data:	Platform-MSI specific data
  * @list:		List of MSI descriptors associated to the device
  * @mutex:		Mutex protecting the MSI list
+ * @__next:		Cached pointer to the next entry for iterators
  */
 struct msi_device_data {
 	unsigned long			properties;
@@ -154,6 +167,7 @@ struct msi_device_data {
 	struct platform_msi_priv_data	*platform_data;
 	struct list_head		list;
 	struct mutex			mutex;
+	struct msi_desc			*__next;
 };
 
 int msi_setup_device_data(struct device *dev);
@@ -162,6 +176,25 @@ unsigned int msi_get_virq(struct device *dev, unsigned int index);
 void msi_lock_descs(struct device *dev);
 void msi_unlock_descs(struct device *dev);
 
+struct msi_desc *msi_first_desc(struct device *dev, enum msi_desc_filter filter);
+struct msi_desc *msi_next_desc(struct device *dev, enum msi_desc_filter filter);
+
+/**
+ * msi_for_each_desc - Iterate the MSI descriptors
+ *
+ * @desc:	struct msi_desc pointer used as iterator
+ * @dev:	struct device pointer - device to iterate
+ * @filter:	Filter for descriptor selection
+ *
+ * Notes:
+ *  - The loop must be protected with a msi_lock_descs()/msi_unlock_descs()
+ *    pair.
+ *  - It is safe to remove a retrieved MSI descriptor in the loop.
+ */
+#define msi_for_each_desc(desc, dev, filter)			\
+	for ((desc) = msi_first_desc((dev), (filter)); (desc);	\
+	     (desc) = msi_next_desc((dev), (filter)))
+
 /* Helpers to hide struct msi_desc implementation details */
 #define msi_desc_to_dev(desc)		((desc)->dev)
 #define dev_to_msi_list(dev)		(&(dev)->msi.data->list)
-- 
cgit v1.2.3


From 602905253607ba892336f7bba8bb45b5be819d87 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 6 Dec 2021 23:51:10 +0100
Subject: genirq/msi: Provide msi_alloc_msi_desc() and a simple allocator

Provide msi_alloc_msi_desc() which takes a template MSI descriptor for
initializing a newly allocated descriptor. This allows to simplify various
usage sites of alloc_msi_entry() and moves the storage handling into the
core code.

For simple cases where only a linear vector space is required provide
msi_add_simple_msi_descs() which just allocates a linear range of MSI
descriptors and fills msi_desc::msi_index accordingly.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Michael Kelley <mikelley@microsoft.com>
Tested-by: Nishanth Menon <nm@ti.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20211206210747.873833567@linutronix.de
---
 include/linux/msi.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index 703221f7e9ea..bbb8c1e2c18b 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -247,6 +247,8 @@ static inline void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg)
 }
 #endif /* CONFIG_PCI_MSI */
 
+int msi_add_msi_desc(struct device *dev, struct msi_desc *init_desc);
+
 struct msi_desc *alloc_msi_entry(struct device *dev, int nvec,
 				 const struct irq_affinity_desc *affinity);
 void free_msi_entry(struct msi_desc *entry);
-- 
cgit v1.2.3


From 645474e2cee450131e8b8d8a69a5d9bbabd43f3f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 6 Dec 2021 23:51:12 +0100
Subject: genirq/msi: Provide domain flags to allocate/free MSI descriptors
 automatically

Provide domain info flags which tell the core to allocate simple
descriptors or to free descriptors when the interrupts are freed and
implement the required functionality.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Michael Kelley <mikelley@microsoft.com>
Tested-by: Nishanth Menon <nm@ti.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20211206210747.928198636@linutronix.de
---
 include/linux/msi.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index bbb8c1e2c18b..17e47ab8d57a 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -105,6 +105,8 @@ struct pci_msi_desc {
 	};
 };
 
+#define MSI_MAX_INDEX		((unsigned int)USHRT_MAX)
+
 /**
  * struct msi_desc - Descriptor structure for MSI based interrupts
  * @list:	List head for management
@@ -248,6 +250,17 @@ static inline void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg)
 #endif /* CONFIG_PCI_MSI */
 
 int msi_add_msi_desc(struct device *dev, struct msi_desc *init_desc);
+void msi_free_msi_descs_range(struct device *dev, enum msi_desc_filter filter,
+			      unsigned int first_index, unsigned int last_index);
+
+/**
+ * msi_free_msi_descs - Free MSI descriptors of a device
+ * @dev:	Device to free the descriptors
+ */
+static inline void msi_free_msi_descs(struct device *dev)
+{
+	msi_free_msi_descs_range(dev, MSI_DESC_ALL, 0, MSI_MAX_INDEX);
+}
 
 struct msi_desc *alloc_msi_entry(struct device *dev, int nvec,
 				 const struct irq_affinity_desc *affinity);
@@ -408,6 +421,10 @@ enum {
 	MSI_FLAG_DEV_SYSFS		= (1 << 7),
 	/* MSI-X entries must be contiguous */
 	MSI_FLAG_MSIX_CONTIGUOUS	= (1 << 8),
+	/* Allocate simple MSI descriptors */
+	MSI_FLAG_ALLOC_SIMPLE_MSI_DESCS	= (1 << 9),
+	/* Free MSI descriptors */
+	MSI_FLAG_FREE_MSI_DESCS		= (1 << 10),
 };
 
 int msi_domain_set_affinity(struct irq_data *data, const struct cpumask *mask,
-- 
cgit v1.2.3


From 7ad321a5eadb52b4af1c577dda51783e08235ea7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 6 Dec 2021 23:51:37 +0100
Subject: soc: ti: ti_sci_inta_msi: Remove ti_sci_inta_msi_domain_free_irqs()

The function has no users and is pointless now that the core frees the MSI
descriptors, which means potential users can just use msi_domain_free_irqs().

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Nishanth Menon <nm@ti.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20211206210748.793119155@linutronix.de
---
 include/linux/soc/ti/ti_sci_inta_msi.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/soc/ti/ti_sci_inta_msi.h b/include/linux/soc/ti/ti_sci_inta_msi.h
index 25ea78a8ea5c..4dba2f2aff6f 100644
--- a/include/linux/soc/ti/ti_sci_inta_msi.h
+++ b/include/linux/soc/ti/ti_sci_inta_msi.h
@@ -18,5 +18,4 @@ struct irq_domain
 				   struct irq_domain *parent);
 int ti_sci_inta_msi_domain_alloc_irqs(struct device *dev,
 				      struct ti_sci_resource *res);
-void ti_sci_inta_msi_domain_free_irqs(struct device *dev);
 #endif /* __INCLUDE_LINUX_IRQCHIP_TI_SCI_INTA_H */
-- 
cgit v1.2.3


From ef8dd01538ea2553ab101ddce6a85a321406d9c0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 6 Dec 2021 23:51:44 +0100
Subject: genirq/msi: Make interrupt allocation less convoluted

There is no real reason to do several loops over the MSI descriptors
instead of just doing one loop. In case of an error everything is undone
anyway so it does not matter whether it's a partial or a full rollback.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Michael Kelley <mikelley@microsoft.com>
Tested-by: Nishanth Menon <nm@ti.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20211206210749.010234767@linutronix.de
---
 include/linux/msi.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index 17e47ab8d57a..e8dd0be17e89 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -206,12 +206,6 @@ struct msi_desc *msi_next_desc(struct device *dev, enum msi_desc_filter filter);
 	list_for_each_entry((desc), dev_to_msi_list((dev)), list)
 #define for_each_msi_entry_safe(desc, tmp, dev)	\
 	list_for_each_entry_safe((desc), (tmp), dev_to_msi_list((dev)), list)
-#define for_each_msi_vector(desc, __irq, dev)				\
-	for_each_msi_entry((desc), (dev))				\
-		if ((desc)->irq)					\
-			for (__irq = (desc)->irq;			\
-			     __irq < ((desc)->irq + (desc)->nvec_used);	\
-			     __irq++)
 
 #ifdef CONFIG_IRQ_MSI_IOMMU
 static inline const void *msi_desc_get_iommu_cookie(struct msi_desc *desc)
-- 
cgit v1.2.3


From cc9a246dbf6bdef56d9eee296a1db52dd0607976 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 6 Dec 2021 23:51:47 +0100
Subject: genirq/msi: Mop up old interfaces

Get rid of the old iterators, alloc/free functions and adjust the core code
accordingly.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Michael Kelley <mikelley@microsoft.com>
Tested-by: Nishanth Menon <nm@ti.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20211206210749.117395027@linutronix.de
---
 include/linux/msi.h | 15 ---------------
 1 file changed, 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index e8dd0be17e89..b54010ba7b0d 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -197,15 +197,7 @@ struct msi_desc *msi_next_desc(struct device *dev, enum msi_desc_filter filter);
 	for ((desc) = msi_first_desc((dev), (filter)); (desc);	\
 	     (desc) = msi_next_desc((dev), (filter)))
 
-/* Helpers to hide struct msi_desc implementation details */
 #define msi_desc_to_dev(desc)		((desc)->dev)
-#define dev_to_msi_list(dev)		(&(dev)->msi.data->list)
-#define first_msi_entry(dev)		\
-	list_first_entry(dev_to_msi_list((dev)), struct msi_desc, list)
-#define for_each_msi_entry(desc, dev)	\
-	list_for_each_entry((desc), dev_to_msi_list((dev)), list)
-#define for_each_msi_entry_safe(desc, tmp, dev)	\
-	list_for_each_entry_safe((desc), (tmp), dev_to_msi_list((dev)), list)
 
 #ifdef CONFIG_IRQ_MSI_IOMMU
 static inline const void *msi_desc_get_iommu_cookie(struct msi_desc *desc)
@@ -231,10 +223,6 @@ static inline void msi_desc_set_iommu_cookie(struct msi_desc *desc,
 #endif
 
 #ifdef CONFIG_PCI_MSI
-#define first_pci_msi_entry(pdev)	first_msi_entry(&(pdev)->dev)
-#define for_each_pci_msi_entry(desc, pdev)	\
-	for_each_msi_entry((desc), &(pdev)->dev)
-
 struct pci_dev *msi_desc_to_pci_dev(struct msi_desc *desc);
 void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg);
 #else /* CONFIG_PCI_MSI */
@@ -256,9 +244,6 @@ static inline void msi_free_msi_descs(struct device *dev)
 	msi_free_msi_descs_range(dev, MSI_DESC_ALL, 0, MSI_MAX_INDEX);
 }
 
-struct msi_desc *alloc_msi_entry(struct device *dev, int nvec,
-				 const struct irq_affinity_desc *affinity);
-void free_msi_entry(struct msi_desc *entry);
 void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
 void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
 
-- 
cgit v1.2.3


From ef3350c53d2aac65cf1c4ecc968bbb1de5f421ea Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 6 Dec 2021 23:51:49 +0100
Subject: genirq/msi: Add abuse prevention comment to msi header

Hope dies last.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20211206210749.170847844@linutronix.de
---
 include/linux/msi.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index b54010ba7b0d..70cc6a555a8e 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -2,6 +2,20 @@
 #ifndef LINUX_MSI_H
 #define LINUX_MSI_H
 
+/*
+ * This header file contains MSI data structures and functions which are
+ * only relevant for:
+ *	- Interrupt core code
+ *	- PCI/MSI core code
+ *	- MSI interrupt domain implementations
+ *	- IOMMU, low level VFIO, NTB and other justified exceptions
+ *	  dealing with low level MSI details.
+ *
+ * Regular device drivers have no business with any of these functions and
+ * especially storing MSI descriptor pointers in random code is considered
+ * abuse. The only function which is relevant for drivers is msi_get_virq().
+ */
+
 #include <linux/cpumask.h>
 #include <linux/mutex.h>
 #include <linux/list.h>
-- 
cgit v1.2.3


From bf5e758f02fc739589dcc6a3395c3a3eb77b5c90 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 6 Dec 2021 23:51:50 +0100
Subject: genirq/msi: Simplify sysfs handling

The sysfs handling for MSI is a convoluted maze and it is in the way of
supporting dynamic expansion of the MSI-X vectors because it only supports
a one off bulk population/free of the sysfs entries.

Change it to do:

   1) Creating an empty sysfs attribute group when msi_device_data is
      allocated

   2) Populate the entries when the MSI descriptor is initialized

   3) Free the entries when a MSI descriptor is detached from a Linux
      interrupt.

   4) Provide functions for the legacy non-irqdomain fallback code to
      do a bulk population/free. This code won't support dynamic
      expansion.

This makes the code simpler and reduces the number of allocations as the
empty attribute group can be shared.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Michael Kelley <mikelley@microsoft.com>
Tested-by: Nishanth Menon <nm@ti.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20211206210749.224917330@linutronix.de
---
 include/linux/msi.h | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index 70cc6a555a8e..1a00367d2cfa 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -71,7 +71,7 @@ struct irq_data;
 struct msi_desc;
 struct pci_dev;
 struct platform_msi_priv_data;
-struct attribute_group;
+struct device_attribute;
 
 void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
 #ifdef CONFIG_GENERIC_MSI_IRQ
@@ -129,6 +129,7 @@ struct pci_msi_desc {
  * @dev:	Pointer to the device which uses this descriptor
  * @msg:	The last set MSI message cached for reuse
  * @affinity:	Optional pointer to a cpu affinity mask for this descriptor
+ * @sysfs_attr:	Pointer to sysfs device attribute
  *
  * @write_msi_msg:	Callback that may be called when the MSI message
  *			address or data changes
@@ -148,6 +149,9 @@ struct msi_desc {
 #ifdef CONFIG_IRQ_MSI_IOMMU
 	const void			*iommu_cookie;
 #endif
+#ifdef CONFIG_SYSFS
+	struct device_attribute		*sysfs_attrs;
+#endif
 
 	void (*write_msi_msg)(struct msi_desc *entry, void *data);
 	void *write_msi_msg_data;
@@ -171,7 +175,6 @@ enum msi_desc_filter {
 /**
  * msi_device_data - MSI per device data
  * @properties:		MSI properties which are interesting to drivers
- * @attrs:		Pointer to the sysfs attribute group
  * @platform_data:	Platform-MSI specific data
  * @list:		List of MSI descriptors associated to the device
  * @mutex:		Mutex protecting the MSI list
@@ -179,7 +182,6 @@ enum msi_desc_filter {
  */
 struct msi_device_data {
 	unsigned long			properties;
-	const struct attribute_group    **attrs;
 	struct platform_msi_priv_data	*platform_data;
 	struct list_head		list;
 	struct mutex			mutex;
@@ -264,14 +266,6 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
 void pci_msi_mask_irq(struct irq_data *data);
 void pci_msi_unmask_irq(struct irq_data *data);
 
-#ifdef CONFIG_SYSFS
-int msi_device_populate_sysfs(struct device *dev);
-void msi_device_destroy_sysfs(struct device *dev);
-#else /* CONFIG_SYSFS */
-static inline int msi_device_populate_sysfs(struct device *dev) { return 0; }
-static inline void msi_device_destroy_sysfs(struct device *dev) { }
-#endif /* !CONFIG_SYSFS */
-
 /*
  * The arch hooks to setup up msi irqs. Default functions are implemented
  * as weak symbols so that they /can/ be overriden by architecture specific
@@ -285,6 +279,13 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc);
 void arch_teardown_msi_irq(unsigned int irq);
 int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
 void arch_teardown_msi_irqs(struct pci_dev *dev);
+#ifdef CONFIG_SYSFS
+int msi_device_populate_sysfs(struct device *dev);
+void msi_device_destroy_sysfs(struct device *dev);
+#else /* CONFIG_SYSFS */
+static inline int msi_device_populate_sysfs(struct device *dev) { return 0; }
+static inline void msi_device_destroy_sysfs(struct device *dev) { }
+#endif /* !CONFIG_SYSFS */
 #endif /* CONFIG_PCI_MSI_ARCH_FALLBACKS */
 
 /*
-- 
cgit v1.2.3


From cd6cf06590b9792340dceaa285138777f3cc4d90 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 6 Dec 2021 23:51:52 +0100
Subject: genirq/msi: Convert storage to xarray

The current linked list storage for MSI descriptors is suboptimal in
several ways:

  1) Looking up a MSI desciptor requires a O(n) list walk in the worst case

  2) The upcoming support of runtime expansion of MSI-X vectors would need
     to do a full list walk to figure out whether a particular index is
     already associated.

  3) Runtime expansion of sparse allocations is even more complex as the
     current implementation assumes an ordered list (increasing MSI index).

Use an xarray which solves all of the above problems nicely.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Michael Kelley <mikelley@microsoft.com>
Tested-by: Nishanth Menon <nm@ti.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20211206210749.280627070@linutronix.de
---
 include/linux/msi.h | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index 1a00367d2cfa..fc918a658d48 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -17,6 +17,7 @@
  */
 
 #include <linux/cpumask.h>
+#include <linux/xarray.h>
 #include <linux/mutex.h>
 #include <linux/list.h>
 #include <asm/msi.h>
@@ -123,7 +124,6 @@ struct pci_msi_desc {
 
 /**
  * struct msi_desc - Descriptor structure for MSI based interrupts
- * @list:	List head for management
  * @irq:	The base interrupt number
  * @nvec_used:	The number of vectors used
  * @dev:	Pointer to the device which uses this descriptor
@@ -140,7 +140,6 @@ struct pci_msi_desc {
  */
 struct msi_desc {
 	/* Shared device/bus type independent data */
-	struct list_head		list;
 	unsigned int			irq;
 	unsigned int			nvec_used;
 	struct device			*dev;
@@ -176,16 +175,16 @@ enum msi_desc_filter {
  * msi_device_data - MSI per device data
  * @properties:		MSI properties which are interesting to drivers
  * @platform_data:	Platform-MSI specific data
- * @list:		List of MSI descriptors associated to the device
- * @mutex:		Mutex protecting the MSI list
- * @__next:		Cached pointer to the next entry for iterators
+ * @mutex:		Mutex protecting the MSI descriptor store
+ * @__store:		Xarray for storing MSI descriptor pointers
+ * @__iter_idx:		Index to search the next entry for iterators
  */
 struct msi_device_data {
 	unsigned long			properties;
 	struct platform_msi_priv_data	*platform_data;
-	struct list_head		list;
 	struct mutex			mutex;
-	struct msi_desc			*__next;
+	struct xarray			__store;
+	unsigned long			__iter_idx;
 };
 
 int msi_setup_device_data(struct device *dev);
-- 
cgit v1.2.3


From 60f20d84dc813f1342771a3e4f06d89da26dc412 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Thu, 18 Nov 2021 12:12:10 -0600
Subject: of/fdt: Rework early_init_dt_scan_chosen() to call directly

Use of the of_scan_flat_dt() function predates libfdt and is discouraged
as libfdt provides a nicer set of APIs. Rework
early_init_dt_scan_chosen() to be called directly and use libfdt.

Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Frank Rowand <frowand.list@gmail.com>
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Rob Herring <robh@kernel.org>
Reviewed-by: Frank Rowand <frank.rowand@sony.com>
Link: https://lore.kernel.org/r/20211118181213.1433346-2-robh@kernel.org
---
 include/linux/of_fdt.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h
index cf48983d3c86..654722235df6 100644
--- a/include/linux/of_fdt.h
+++ b/include/linux/of_fdt.h
@@ -58,8 +58,7 @@ extern int of_flat_dt_is_compatible(unsigned long node, const char *name);
 extern unsigned long of_get_flat_dt_root(void);
 extern uint32_t of_get_flat_dt_phandle(unsigned long node);
 
-extern int early_init_dt_scan_chosen(unsigned long node, const char *uname,
-				     int depth, void *data);
+extern int early_init_dt_scan_chosen(char *cmdline);
 extern int early_init_dt_scan_memory(unsigned long node, const char *uname,
 				     int depth, void *data);
 extern int early_init_dt_scan_chosen_stdout(void);
-- 
cgit v1.2.3


From d665881d2171b62ca1ea23be89be6f2a8a330bb2 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Thu, 18 Nov 2021 12:12:11 -0600
Subject: of/fdt: Rework early_init_dt_scan_root() to call directly

Use of the of_scan_flat_dt() function predates libfdt and is discouraged
as libfdt provides a nicer set of APIs. Rework early_init_dt_scan_root()
to be called directly and use libfdt.

Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Frank Rowand <frowand.list@gmail.com>
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Rob Herring <robh@kernel.org>
Reviewed-by: Frank Rowand <frank.rowand@sony.com>
Link: https://lore.kernel.org/r/20211118181213.1433346-3-robh@kernel.org
---
 include/linux/of_fdt.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h
index 654722235df6..df3d31926c3c 100644
--- a/include/linux/of_fdt.h
+++ b/include/linux/of_fdt.h
@@ -68,8 +68,7 @@ extern void early_init_dt_add_memory_arch(u64 base, u64 size);
 extern u64 dt_mem_next_cell(int s, const __be32 **cellp);
 
 /* Early flat tree scan hooks */
-extern int early_init_dt_scan_root(unsigned long node, const char *uname,
-				   int depth, void *data);
+extern int early_init_dt_scan_root(void);
 
 extern bool early_init_dt_scan(void *params);
 extern bool early_init_dt_verify(void *params);
-- 
cgit v1.2.3


From 1f012283e9360fb4007308f04cfaeb205e34b684 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Wed, 15 Dec 2021 09:01:02 -0600
Subject: of/fdt: Rework early_init_dt_scan_memory() to call directly

Use of the of_scan_flat_dt() function predates libfdt and is discouraged
as libfdt provides a nicer set of APIs. Rework
early_init_dt_scan_memory() to be called directly and use libfdt.

Cc: John Crispin <john@phrozen.org>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Frank Rowand <frowand.list@gmail.com>
Cc: linux-mips@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Reviewed-by: Frank Rowand <frank.rowand@sony.com>
Signed-off-by: Rob Herring <robh@kernel.org>
Tested-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20211215150102.1303588-1-robh@kernel.org
---
 include/linux/of_fdt.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h
index df3d31926c3c..914739f3c192 100644
--- a/include/linux/of_fdt.h
+++ b/include/linux/of_fdt.h
@@ -59,8 +59,7 @@ extern unsigned long of_get_flat_dt_root(void);
 extern uint32_t of_get_flat_dt_phandle(unsigned long node);
 
 extern int early_init_dt_scan_chosen(char *cmdline);
-extern int early_init_dt_scan_memory(unsigned long node, const char *uname,
-				     int depth, void *data);
+extern int early_init_dt_scan_memory(void);
 extern int early_init_dt_scan_chosen_stdout(void);
 extern void early_init_fdt_scan_reserved_mem(void);
 extern void early_init_fdt_reserve_self(void);
-- 
cgit v1.2.3


From f7ea534a0920dbaf71a8003936e178e14ec9271d Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 15 Dec 2021 18:55:36 -0800
Subject: add includes masked by cgroup -> bpf dependency

cgroup pulls in BPF which pulls in a lot of includes.
We're about to break that chain so fix those who were
depending on it.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211216025538.1649516-2-kuba@kernel.org
---
 include/linux/perf_event.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 0dcfd265beed..4a021149eaf0 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -611,6 +611,7 @@ struct swevent_hlist {
 #define PERF_ATTACH_SCHED_CB	0x20
 #define PERF_ATTACH_CHILD	0x40
 
+struct bpf_prog;
 struct perf_cgroup;
 struct perf_buffer;
 
-- 
cgit v1.2.3


From fd1740b6abac39f68ce12e201697f106e0f1d519 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 15 Dec 2021 18:55:38 -0800
Subject: bpf: Remove the cgroup -> bpf header dependecy

Remove the dependency from cgroup-defs.h to bpf-cgroup.h and bpf.h.
This reduces the incremental build size of x86 allmodconfig after
bpf.h was touched from ~17k objects rebuilt to ~5k objects.
bpf.h is 2.2kLoC and is modified relatively often.

We need a new header with just the definition of struct cgroup_bpf
and enum cgroup_bpf_attach_type, this is akin to cgroup-defs.h.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Tejun Heo <tj@kernel.org>
Link: https://lore.kernel.org/bpf/20211216025538.1649516-4-kuba@kernel.org
---
 include/linux/bpf-cgroup-defs.h | 70 +++++++++++++++++++++++++++++++++++++++++
 include/linux/bpf-cgroup.h      | 57 +--------------------------------
 include/linux/cgroup-defs.h     |  2 +-
 3 files changed, 72 insertions(+), 57 deletions(-)
 create mode 100644 include/linux/bpf-cgroup-defs.h

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup-defs.h b/include/linux/bpf-cgroup-defs.h
new file mode 100644
index 000000000000..695d1224a71b
--- /dev/null
+++ b/include/linux/bpf-cgroup-defs.h
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BPF_CGROUP_DEFS_H
+#define _BPF_CGROUP_DEFS_H
+
+#ifdef CONFIG_CGROUP_BPF
+
+#include <linux/list.h>
+#include <linux/percpu-refcount.h>
+#include <linux/workqueue.h>
+
+struct bpf_prog_array;
+
+enum cgroup_bpf_attach_type {
+	CGROUP_BPF_ATTACH_TYPE_INVALID = -1,
+	CGROUP_INET_INGRESS = 0,
+	CGROUP_INET_EGRESS,
+	CGROUP_INET_SOCK_CREATE,
+	CGROUP_SOCK_OPS,
+	CGROUP_DEVICE,
+	CGROUP_INET4_BIND,
+	CGROUP_INET6_BIND,
+	CGROUP_INET4_CONNECT,
+	CGROUP_INET6_CONNECT,
+	CGROUP_INET4_POST_BIND,
+	CGROUP_INET6_POST_BIND,
+	CGROUP_UDP4_SENDMSG,
+	CGROUP_UDP6_SENDMSG,
+	CGROUP_SYSCTL,
+	CGROUP_UDP4_RECVMSG,
+	CGROUP_UDP6_RECVMSG,
+	CGROUP_GETSOCKOPT,
+	CGROUP_SETSOCKOPT,
+	CGROUP_INET4_GETPEERNAME,
+	CGROUP_INET6_GETPEERNAME,
+	CGROUP_INET4_GETSOCKNAME,
+	CGROUP_INET6_GETSOCKNAME,
+	CGROUP_INET_SOCK_RELEASE,
+	MAX_CGROUP_BPF_ATTACH_TYPE
+};
+
+struct cgroup_bpf {
+	/* array of effective progs in this cgroup */
+	struct bpf_prog_array __rcu *effective[MAX_CGROUP_BPF_ATTACH_TYPE];
+
+	/* attached progs to this cgroup and attach flags
+	 * when flags == 0 or BPF_F_ALLOW_OVERRIDE the progs list will
+	 * have either zero or one element
+	 * when BPF_F_ALLOW_MULTI the list can have up to BPF_CGROUP_MAX_PROGS
+	 */
+	struct list_head progs[MAX_CGROUP_BPF_ATTACH_TYPE];
+	u32 flags[MAX_CGROUP_BPF_ATTACH_TYPE];
+
+	/* list of cgroup shared storages */
+	struct list_head storages;
+
+	/* temp storage for effective prog array used by prog_attach/detach */
+	struct bpf_prog_array *inactive;
+
+	/* reference counter used to detach bpf programs after cgroup removal */
+	struct percpu_ref refcnt;
+
+	/* cgroup_bpf is released using a work queue */
+	struct work_struct release_work;
+};
+
+#else /* CONFIG_CGROUP_BPF */
+struct cgroup_bpf {};
+#endif /* CONFIG_CGROUP_BPF */
+
+#endif
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 11820a430d6c..b525d8cdc25b 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -3,10 +3,10 @@
 #define _BPF_CGROUP_H
 
 #include <linux/bpf.h>
+#include <linux/bpf-cgroup-defs.h>
 #include <linux/errno.h>
 #include <linux/jump_label.h>
 #include <linux/percpu.h>
-#include <linux/percpu-refcount.h>
 #include <linux/rbtree.h>
 #include <uapi/linux/bpf.h>
 
@@ -23,33 +23,6 @@ struct ctl_table_header;
 struct task_struct;
 
 #ifdef CONFIG_CGROUP_BPF
-enum cgroup_bpf_attach_type {
-	CGROUP_BPF_ATTACH_TYPE_INVALID = -1,
-	CGROUP_INET_INGRESS = 0,
-	CGROUP_INET_EGRESS,
-	CGROUP_INET_SOCK_CREATE,
-	CGROUP_SOCK_OPS,
-	CGROUP_DEVICE,
-	CGROUP_INET4_BIND,
-	CGROUP_INET6_BIND,
-	CGROUP_INET4_CONNECT,
-	CGROUP_INET6_CONNECT,
-	CGROUP_INET4_POST_BIND,
-	CGROUP_INET6_POST_BIND,
-	CGROUP_UDP4_SENDMSG,
-	CGROUP_UDP6_SENDMSG,
-	CGROUP_SYSCTL,
-	CGROUP_UDP4_RECVMSG,
-	CGROUP_UDP6_RECVMSG,
-	CGROUP_GETSOCKOPT,
-	CGROUP_SETSOCKOPT,
-	CGROUP_INET4_GETPEERNAME,
-	CGROUP_INET6_GETPEERNAME,
-	CGROUP_INET4_GETSOCKNAME,
-	CGROUP_INET6_GETSOCKNAME,
-	CGROUP_INET_SOCK_RELEASE,
-	MAX_CGROUP_BPF_ATTACH_TYPE
-};
 
 #define CGROUP_ATYPE(type) \
 	case BPF_##type: return type
@@ -127,33 +100,6 @@ struct bpf_prog_list {
 	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE];
 };
 
-struct bpf_prog_array;
-
-struct cgroup_bpf {
-	/* array of effective progs in this cgroup */
-	struct bpf_prog_array __rcu *effective[MAX_CGROUP_BPF_ATTACH_TYPE];
-
-	/* attached progs to this cgroup and attach flags
-	 * when flags == 0 or BPF_F_ALLOW_OVERRIDE the progs list will
-	 * have either zero or one element
-	 * when BPF_F_ALLOW_MULTI the list can have up to BPF_CGROUP_MAX_PROGS
-	 */
-	struct list_head progs[MAX_CGROUP_BPF_ATTACH_TYPE];
-	u32 flags[MAX_CGROUP_BPF_ATTACH_TYPE];
-
-	/* list of cgroup shared storages */
-	struct list_head storages;
-
-	/* temp storage for effective prog array used by prog_attach/detach */
-	struct bpf_prog_array *inactive;
-
-	/* reference counter used to detach bpf programs after cgroup removal */
-	struct percpu_ref refcnt;
-
-	/* cgroup_bpf is released using a work queue */
-	struct work_struct release_work;
-};
-
 int cgroup_bpf_inherit(struct cgroup *cgrp);
 void cgroup_bpf_offline(struct cgroup *cgrp);
 
@@ -451,7 +397,6 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
 			  union bpf_attr __user *uattr);
 #else
 
-struct cgroup_bpf {};
 static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
 static inline void cgroup_bpf_offline(struct cgroup *cgrp) {}
 
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index db2e147e069f..411684c80cf3 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -19,7 +19,7 @@
 #include <linux/percpu-rwsem.h>
 #include <linux/u64_stats_sync.h>
 #include <linux/workqueue.h>
-#include <linux/bpf-cgroup.h>
+#include <linux/bpf-cgroup-defs.h>
 #include <linux/psi_types.h>
 
 #ifdef CONFIG_CGROUPS
-- 
cgit v1.2.3


From 0f55f9ed21f96630c6ec96805d42f92c0b458b37 Mon Sep 17 00:00:00 2001
From: Christy Lee <christylee@fb.com>
Date: Thu, 16 Dec 2021 13:33:56 -0800
Subject: bpf: Only print scratched registers and stack slots to verifier logs.

When printing verifier state for any log level, print full verifier
state only on function calls or on errors. Otherwise, only print the
registers and stack slots that were accessed.

Log size differences:

verif_scale_loop6 before: 234566564
verif_scale_loop6 after: 72143943
69% size reduction

kfree_skb before: 166406
kfree_skb after: 55386
69% size reduction

Before:

156: (61) r0 = *(u32 *)(r1 +0)
157: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1=ctx(id=0,off=0,imm=0) R2_w=invP0 R10=fp0 fp-8_w=00000000 fp-16_w=00\
000000 fp-24_w=00000000 fp-32_w=00000000 fp-40_w=00000000 fp-48_w=00000000 fp-56_w=00000000 fp-64_w=00000000 fp-72_w=00000000 fp-80_w=00000\
000 fp-88_w=00000000 fp-96_w=00000000 fp-104_w=00000000 fp-112_w=00000000 fp-120_w=00000000 fp-128_w=00000000 fp-136_w=00000000 fp-144_w=00\
000000 fp-152_w=00000000 fp-160_w=00000000 fp-168_w=00000000 fp-176_w=00000000 fp-184_w=00000000 fp-192_w=00000000 fp-200_w=00000000 fp-208\
_w=00000000 fp-216_w=00000000 fp-224_w=00000000 fp-232_w=00000000 fp-240_w=00000000 fp-248_w=00000000 fp-256_w=00000000 fp-264_w=00000000 f\
p-272_w=00000000 fp-280_w=00000000 fp-288_w=00000000 fp-296_w=00000000 fp-304_w=00000000 fp-312_w=00000000 fp-320_w=00000000 fp-328_w=00000\
000 fp-336_w=00000000 fp-344_w=00000000 fp-352_w=00000000 fp-360_w=00000000 fp-368_w=00000000 fp-376_w=00000000 fp-384_w=00000000 fp-392_w=\
00000000 fp-400_w=00000000 fp-408_w=00000000 fp-416_w=00000000 fp-424_w=00000000 fp-432_w=00000000 fp-440_w=00000000 fp-448_w=00000000
; return skb->len;
157: (95) exit
Func#4 is safe for any args that match its prototype
Validating get_constant() func#5...
158: R1=invP(id=0) R10=fp0
; int get_constant(long val)
158: (bf) r0 = r1
159: R0_w=invP(id=1) R1=invP(id=1) R10=fp0
; return val - 122;
159: (04) w0 += -122
160: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1=invP(id=1) R10=fp0
; return val - 122;
160: (95) exit
Func#5 is safe for any args that match its prototype
Validating get_skb_ifindex() func#6...
161: R1=invP(id=0) R2=ctx(id=0,off=0,imm=0) R3=invP(id=0) R10=fp0
; int get_skb_ifindex(int val, struct __sk_buff *skb, int var)
161: (bc) w0 = w3
162: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1=invP(id=0) R2=ctx(id=0,off=0,imm=0) R3=invP(id=0) R10=fp0

After:

156: (61) r0 = *(u32 *)(r1 +0)
157: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1=ctx(id=0,off=0,imm=0)
; return skb->len;
157: (95) exit
Func#4 is safe for any args that match its prototype
Validating get_constant() func#5...
158: R1=invP(id=0) R10=fp0
; int get_constant(long val)
158: (bf) r0 = r1
159: R0_w=invP(id=1) R1=invP(id=1)
; return val - 122;
159: (04) w0 += -122
160: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff))
; return val - 122;
160: (95) exit
Func#5 is safe for any args that match its prototype
Validating get_skb_ifindex() func#6...
161: R1=invP(id=0) R2=ctx(id=0,off=0,imm=0) R3=invP(id=0) R10=fp0
; int get_skb_ifindex(int val, struct __sk_buff *skb, int var)
161: (bc) w0 = w3
162: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R3=invP(id=0)

Signed-off-by: Christy Lee <christylee@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211216213358.3374427-2-christylee@fb.com
---
 include/linux/bpf_verifier.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 182b16a91084..c66f238c538d 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -474,6 +474,13 @@ struct bpf_verifier_env {
 	/* longest register parentage chain walked for liveness marking */
 	u32 longest_mark_read_walk;
 	bpfptr_t fd_array;
+
+	/* bit mask to keep track of whether a register has been accessed
+	 * since the last time the function state was printed
+	 */
+	u32 scratched_regs;
+	/* Same as scratched_regs but for stack slots */
+	u64 scratched_stack_slots;
 };
 
 __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log,
-- 
cgit v1.2.3


From 2e5766483c8c5cf886b4dc647a1741738dde7d79 Mon Sep 17 00:00:00 2001
From: Christy Lee <christylee@fb.com>
Date: Thu, 16 Dec 2021 19:42:45 -0800
Subject: bpf: Right align verifier states in verifier logs.

Make the verifier logs more readable, print the verifier states
on the corresponding instruction line. If the previous line was
not a bpf instruction, then print the verifier states on its own
line.

Before:

Validating test_pkt_access_subprog3() func#3...
86: R1=invP(id=0) R2=ctx(id=0,off=0,imm=0) R10=fp0
; int test_pkt_access_subprog3(int val, struct __sk_buff *skb)
86: (bf) r6 = r2
87: R2=ctx(id=0,off=0,imm=0) R6_w=ctx(id=0,off=0,imm=0)
87: (bc) w7 = w1
88: R1=invP(id=0) R7_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff))
; return get_skb_len(skb) * get_skb_ifindex(val, skb, get_constant(123));
88: (bf) r1 = r6
89: R1_w=ctx(id=0,off=0,imm=0) R6_w=ctx(id=0,off=0,imm=0)
89: (85) call pc+9
Func#4 is global and valid. Skipping.
90: R0_w=invP(id=0)
90: (bc) w8 = w0
91: R0_w=invP(id=0) R8_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff))
; return get_skb_len(skb) * get_skb_ifindex(val, skb, get_constant(123));
91: (b7) r1 = 123
92: R1_w=invP123
92: (85) call pc+65
Func#5 is global and valid. Skipping.
93: R0=invP(id=0)

After:

86: R1=invP(id=0) R2=ctx(id=0,off=0,imm=0) R10=fp0
; int test_pkt_access_subprog3(int val, struct __sk_buff *skb)
86: (bf) r6 = r2                      ; R2=ctx(id=0,off=0,imm=0) R6_w=ctx(id=0,off=0,imm=0)
87: (bc) w7 = w1                      ; R1=invP(id=0) R7_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff))
; return get_skb_len(skb) * get_skb_ifindex(val, skb, get_constant(123));
88: (bf) r1 = r6                      ; R1_w=ctx(id=0,off=0,imm=0) R6_w=ctx(id=0,off=0,imm=0)
89: (85) call pc+9
Func#4 is global and valid. Skipping.
90: R0_w=invP(id=0)
90: (bc) w8 = w0                      ; R0_w=invP(id=0) R8_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff))
; return get_skb_len(skb) * get_skb_ifindex(val, skb, get_constant(123));
91: (b7) r1 = 123                     ; R1_w=invP123
92: (85) call pc+65
Func#5 is global and valid. Skipping.
93: R0=invP(id=0)

Signed-off-by: Christy Lee <christylee@fb.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index c66f238c538d..ee931398f311 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -388,6 +388,8 @@ static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log)
 #define BPF_LOG_LEVEL	(BPF_LOG_LEVEL1 | BPF_LOG_LEVEL2)
 #define BPF_LOG_MASK	(BPF_LOG_LEVEL | BPF_LOG_STATS)
 #define BPF_LOG_KERNEL	(BPF_LOG_MASK + 1) /* kernel internal flag */
+#define BPF_LOG_MIN_ALIGNMENT 8U
+#define BPF_LOG_ALIGNMENT 40U
 
 static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
 {
@@ -481,6 +483,7 @@ struct bpf_verifier_env {
 	u32 scratched_regs;
 	/* Same as scratched_regs but for stack slots */
 	u64 scratched_stack_slots;
+	u32 prev_log_len, prev_insn_print_len;
 };
 
 __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log,
-- 
cgit v1.2.3


From 03de6b273805b3c552ff158f8688555937375926 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 22 Nov 2021 23:22:00 +0100
Subject: dmaengine: qcom-adm: stop abusing slave_id config

The slave_id was previously used to pick one DMA slave instead of another,
but this is now done through the DMA descriptors in device tree.

For the qcom_adm driver, the configuration is documented in the DT
binding to contain a tuple of device identifier and a "crci" field,
but the implementation ends up using only a single cell for identifying
the slave, with the crci getting passed in nonstandard properties of
the device, and passed through the dma driver using the old slave_id
field. Part of the problem apparently is that the nand driver ends up
using only a single DMA request ID, but requires distinct values for
"crci" depending on the type of transfer.

Change both the dmaengine driver and the two slave drivers to allow
the documented binding to work in addition to the ad-hoc passing
of crci values. In order to no longer abuse the slave_id field, pass
the data using the "peripheral_config" mechanism instead.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20211122222203.4103644-9-arnd@kernel.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/dma/qcom_adm.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100644 include/linux/dma/qcom_adm.h

(limited to 'include/linux')

diff --git a/include/linux/dma/qcom_adm.h b/include/linux/dma/qcom_adm.h
new file mode 100644
index 000000000000..af20df674f0c
--- /dev/null
+++ b/include/linux/dma/qcom_adm.h
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#ifndef __LINUX_DMA_QCOM_ADM_H
+#define __LINUX_DMA_QCOM_ADM_H
+
+#include <linux/types.h>
+
+struct qcom_adm_peripheral_config {
+	u32 crci;
+	u32 mux;
+};
+
+#endif /* __LINUX_DMA_QCOM_ADM_H */
-- 
cgit v1.2.3


From 93cdb5b0dc56cc7a8b87a61146495f3bdc93d7ba Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 22 Nov 2021 23:22:01 +0100
Subject: dmaengine: xilinx_dpdma: stop using slave_id field

The display driver wants to pass a custom flag to the DMA engine driver,
which it started doing by using the slave_id field that was traditionally
used for a different purpose.

As there is no longer a correct use for the slave_id field, it should
really be removed, and the remaining users changed over to something
different.

The new mechanism for passing nonstandard settings is using the
.peripheral_config field, so use that to pass a newly defined structure
here, making it clear that this will not work in portable drivers.

Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20211122222203.4103644-10-arnd@kernel.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/dma/xilinx_dpdma.h | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 include/linux/dma/xilinx_dpdma.h

(limited to 'include/linux')

diff --git a/include/linux/dma/xilinx_dpdma.h b/include/linux/dma/xilinx_dpdma.h
new file mode 100644
index 000000000000..83a1377f03f8
--- /dev/null
+++ b/include/linux/dma/xilinx_dpdma.h
@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef __LINUX_DMA_XILINX_DPDMA_H
+#define __LINUX_DMA_XILINX_DPDMA_H
+
+#include <linux/types.h>
+
+struct xilinx_dpdma_peripheral_config {
+	bool video_group;
+};
+
+#endif /* __LINUX_DMA_XILINX_DPDMA_H */
-- 
cgit v1.2.3


From 3c219644075795a99271d345efdfa8b256e55161 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 22 Nov 2021 23:22:03 +0100
Subject: dmaengine: remove slave_id config field

All references to the slave_id field have been removed, so remove the
field as well to prevent new references from creeping in again.

Originally this allowed slave DMA drivers to configure which device
is accessed with the dmaengine_slave_config() call, but this was
inconsistent, as the same information is also passed while requesting
a channel, and never changes in practice.

In modern kernels, the device is always selected when requesting
the channel, so the .slave_id field is no longer useful.

Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20211122222203.4103644-12-arnd@kernel.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/dmaengine.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 9000f3ffce8b..0349b35235e6 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -418,9 +418,6 @@ enum dma_slave_buswidth {
  * @device_fc: Flow Controller Settings. Only valid for slave channels. Fill
  * with 'true' if peripheral should be flow controller. Direction will be
  * selected at Runtime.
- * @slave_id: Slave requester id. Only valid for slave channels. The dma
- * slave peripheral will have unique id as dma requester which need to be
- * pass as slave config.
  * @peripheral_config: peripheral configuration for programming peripheral
  * for dmaengine transfer
  * @peripheral_size: peripheral configuration buffer size
@@ -448,7 +445,6 @@ struct dma_slave_config {
 	u32 src_port_window_size;
 	u32 dst_port_window_size;
 	bool device_fc;
-	unsigned int slave_id;
 	void *peripheral_config;
 	size_t peripheral_size;
 };
-- 
cgit v1.2.3


From 3d725965f836a7acbd1674e33644bec18373de53 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Tue, 7 Dec 2021 15:33:06 -0800
Subject: crypto: ccp - Add SEV_INIT_EX support

Add new module parameter to allow users to use SEV_INIT_EX instead of
SEV_INIT. This helps users who lock their SPI bus to use the PSP for SEV
functionality. The 'init_ex_path' parameter defaults to NULL which means
the kernel will use SEV_INIT, if a path is specified SEV_INIT_EX will be
used with the data found at the path. On certain PSP commands this
file is written to as the PSP updates the NV memory region. Depending on
file system initialization this file open may fail during module init
but the CCP driver for SEV already has sufficient retries for platform
initialization. During normal operation of PSP system and SEV commands
if the PSP has not been initialized it is at run time. If the file at
'init_ex_path' does not exist the PSP will not be initialized. The user
must create the file prior to use with 32Kb of 0xFFs per spec.

Signed-off-by: David Rientjes <rientjes@google.com>
Co-developed-by: Peter Gonda <pgonda@google.com>
Signed-off-by: Peter Gonda <pgonda@google.com>
Reviewed-by: Marc Orr <marcorr@google.com>
Reported-by: kernel test robot <lkp@intel.com>
Acked-by: Brijesh Singh <brijesh.singh@amd.com>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Marc Orr <marcorr@google.com>
Cc: Joerg Roedel <jroedel@suse.de>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: David Rientjes <rientjes@google.com>
Cc: John Allen <john.allen@amd.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: linux-crypto@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/psp-sev.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h
index d48a7192e881..1595088c428b 100644
--- a/include/linux/psp-sev.h
+++ b/include/linux/psp-sev.h
@@ -52,6 +52,7 @@ enum sev_cmd {
 	SEV_CMD_DF_FLUSH		= 0x00A,
 	SEV_CMD_DOWNLOAD_FIRMWARE	= 0x00B,
 	SEV_CMD_GET_ID			= 0x00C,
+	SEV_CMD_INIT_EX                 = 0x00D,
 
 	/* Guest commands */
 	SEV_CMD_DECOMMISSION		= 0x020,
@@ -102,6 +103,26 @@ struct sev_data_init {
 	u32 tmr_len;			/* In */
 } __packed;
 
+/**
+ * struct sev_data_init_ex - INIT_EX command parameters
+ *
+ * @length: len of the command buffer read by the PSP
+ * @flags: processing flags
+ * @tmr_address: system physical address used for SEV-ES
+ * @tmr_len: len of tmr_address
+ * @nv_address: system physical address used for PSP NV storage
+ * @nv_len: len of nv_address
+ */
+struct sev_data_init_ex {
+	u32 length;                     /* In */
+	u32 flags;                      /* In */
+	u64 tmr_address;                /* In */
+	u32 tmr_len;                    /* In */
+	u32 reserved;                   /* In */
+	u64 nv_address;                 /* In/Out */
+	u32 nv_len;                     /* In */
+} __packed;
+
 #define SEV_INIT_FLAGS_SEV_ES	0x01
 
 /**
-- 
cgit v1.2.3


From 9dfa5b6f5efb85efe69fd3b7b0b912004d9547f1 Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Thu, 16 Dec 2021 09:17:03 +0800
Subject: iommu/vt-d: Remove unused macros

These macros has no reference in the tree anymore. Cleanup them.

Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20211216011703.763331-1-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/intel-svm.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h
index 57cceecbe37f..1b73bab7eeff 100644
--- a/include/linux/intel-svm.h
+++ b/include/linux/intel-svm.h
@@ -8,12 +8,6 @@
 #ifndef __INTEL_SVM_H__
 #define __INTEL_SVM_H__
 
-/* Values for rxwp in fault_cb callback */
-#define SVM_REQ_READ	(1<<3)
-#define SVM_REQ_WRITE	(1<<2)
-#define SVM_REQ_EXEC	(1<<1)
-#define SVM_REQ_PRIV	(1<<0)
-
 /* Page Request Queue depth */
 #define PRQ_ORDER	2
 #define PRQ_RING_MASK	((0x1000 << PRQ_ORDER) - 0x20)
-- 
cgit v1.2.3


From b62e3317b68d9c84301940ca8ca9c35a584111b2 Mon Sep 17 00:00:00 2001
From: Xiang wangx <wangxiang@cdjrlc.com>
Date: Thu, 16 Dec 2021 23:19:16 +0800
Subject: net: fix typo in a comment

The double 'as' in a comment is repeated, thus it should be removed.

Signed-off-by: Xiang wangx <wangxiang@cdjrlc.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index be5cb3360b94..6aadcc0ecb5b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1937,7 +1937,7 @@ enum netdev_ml_priv_type {
  *	@udp_tunnel_nic:	UDP tunnel offload state
  *	@xdp_state:		stores info on attached XDP BPF programs
  *
- *	@nested_level:	Used as as a parameter of spin_lock_nested() of
+ *	@nested_level:	Used as a parameter of spin_lock_nested() of
  *			dev->addr_list_lock.
  *	@unlink_list:	As netif_addr_lock() can be called recursively,
  *			keep a list of interfaces to be deleted.
-- 
cgit v1.2.3


From dd61b29207ca4f346fbd9c06bc49f093e3369185 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <brgl@bgdev.pl>
Date: Tue, 7 Dec 2021 10:34:06 +0100
Subject: gpiolib: provide gpiod_remove_hogs()

Currently all users of gpiod_add_hogs() call it only once at system
init so there never was any need for a mechanism allowing to remove
them. Now the upcoming gpio-sim will need to tear down chips with hogged
lines so provide a function that allows to remove hogs.

Signed-off-by: Bartosz Golaszewski <brgl@bgdev.pl>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/gpio/machine.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/gpio/machine.h b/include/linux/gpio/machine.h
index d755e529c1e3..2647dd10b541 100644
--- a/include/linux/gpio/machine.h
+++ b/include/linux/gpio/machine.h
@@ -100,6 +100,7 @@ void gpiod_add_lookup_table(struct gpiod_lookup_table *table);
 void gpiod_add_lookup_tables(struct gpiod_lookup_table **tables, size_t n);
 void gpiod_remove_lookup_table(struct gpiod_lookup_table *table);
 void gpiod_add_hogs(struct gpiod_hog *hogs);
+void gpiod_remove_hogs(struct gpiod_hog *hogs);
 #else /* ! CONFIG_GPIOLIB */
 static inline
 void gpiod_add_lookup_table(struct gpiod_lookup_table *table) {}
@@ -108,6 +109,7 @@ void gpiod_add_lookup_tables(struct gpiod_lookup_table **tables, size_t n) {}
 static inline
 void gpiod_remove_lookup_table(struct gpiod_lookup_table *table) {}
 static inline void gpiod_add_hogs(struct gpiod_hog *hogs) {}
+static inline void gpiod_remove_hogs(struct gpiod_hog *hogs) {}
 #endif /* CONFIG_GPIOLIB */
 
 #endif /* __LINUX_GPIO_MACHINE_H */
-- 
cgit v1.2.3


From 990f6756bb64756d2d1033118cded6333b43397d Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <brgl@bgdev.pl>
Date: Mon, 13 Dec 2021 11:16:41 +0100
Subject: gpiolib: allow to specify the firmware node in struct gpio_chip

Software nodes allow us to represent hierarchies for device components
that don't have their struct device representation yet - for instance:
banks of GPIOs under a common GPIO expander. The core gpiolib core
however doesn't offer any way of passing this information from the
drivers.

This extends struct gpio_chip with a pointer to fwnode that can be set
by the driver and used to pass device properties for child nodes.

This is similar to how we handle device-tree sub-nodes with
CONFIG_OF_GPIO enabled.

Signed-off-by: Bartosz Golaszewski <brgl@bgdev.pl>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/gpio/driver.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index a673a359e20b..b0728c8ad90c 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -289,6 +289,7 @@ struct gpio_irq_chip {
  *	number or the name of the SoC IP-block implementing it.
  * @gpiodev: the internal state holder, opaque struct
  * @parent: optional parent device providing the GPIOs
+ * @fwnode: optional fwnode providing this controller's properties
  * @owner: helps prevent removal of modules exporting active GPIOs
  * @request: optional hook for chip-specific activation, such as
  *	enabling module power and clock; may sleep
@@ -377,6 +378,7 @@ struct gpio_chip {
 	const char		*label;
 	struct gpio_device	*gpiodev;
 	struct device		*parent;
+	struct fwnode_handle	*fwnode;
 	struct module		*owner;
 
 	int			(*request)(struct gpio_chip *gc,
-- 
cgit v1.2.3


From c06ef740d401d0f4ab188882bf6f8d9cf0f75eaf Mon Sep 17 00:00:00 2001
From: Paul Cercueil <paul@crapouillou.net>
Date: Tue, 7 Dec 2021 00:20:59 +0000
Subject: PM: core: Redefine pm_ptr() macro

The pm_ptr() macro was previously conditionally defined, according to
the value of the CONFIG_PM option. This meant that the pointed structure
was either referenced (if CONFIG_PM was set), or never referenced (if
CONFIG_PM was not set), causing it to be detected as unused by the
compiler.

This worked fine, but required the __maybe_unused compiler attribute to
be used to every symbol pointed to by a pointer wrapped with pm_ptr().

We can do better. With this change, the pm_ptr() is now defined the
same, independently of the value of CONFIG_PM. It now uses the (?:)
ternary operator to conditionally resolve to its argument. Since the
condition is known at compile time, the compiler will then choose to
discard the unused symbols, which won't need to be tagged with
__maybe_unused anymore.

This pm_ptr() macro is usually used with pointers to dev_pm_ops
structures created with SIMPLE_DEV_PM_OPS() or similar macros. These do
use a __maybe_unused flag, which is now useless with this change, so it
later can be removed. However in the meantime it causes no harm, and all
the drivers still compile fine with the new pm_ptr() macro.

Signed-off-by: Paul Cercueil <paul@crapouillou.net>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/pm.h | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pm.h b/include/linux/pm.h
index 1d8209c09686..b88ac7dcf2a2 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -373,11 +373,7 @@ const struct dev_pm_ops __maybe_unused name = { \
 	SET_RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \
 }
 
-#ifdef CONFIG_PM
-#define pm_ptr(_ptr) (_ptr)
-#else
-#define pm_ptr(_ptr) NULL
-#endif
+#define pm_ptr(_ptr) PTR_IF(IS_ENABLED(CONFIG_PM), (_ptr))
 
 /*
  * PM_EVENT_ messages
-- 
cgit v1.2.3


From 1a3c7bb088266fa2db017be299f91f1c1894c857 Mon Sep 17 00:00:00 2001
From: Paul Cercueil <paul@crapouillou.net>
Date: Tue, 7 Dec 2021 00:21:00 +0000
Subject: PM: core: Add new *_PM_OPS macros, deprecate old ones

This commit introduces the following macros:

SYSTEM_SLEEP_PM_OPS()
LATE_SYSTEM_SLEEP_PM_OPS()
NOIRQ_SYSTEM_SLEEP_PM_OPS()
RUNTIME_PM_OPS()

These new macros are very similar to their SET_*_PM_OPS() equivalent.
They however differ in the fact that the callbacks they set will always
be seen as referenced by the compiler. This means that the callback
functions don't need to be wrapped with a #ifdef CONFIG_PM guard, or
tagged with __maybe_unused, to prevent the compiler from complaining
about unused static symbols. The compiler will then simply evaluate at
compile time whether or not these symbols are dead code.

The callbacks that are only useful with CONFIG_PM_SLEEP is enabled, are
now also wrapped with a new pm_sleep_ptr() macro, which is inspired from
pm_ptr(). This is needed for drivers that use different callbacks for
sleep and runtime PM, to handle the case where CONFIG_PM is set and
CONFIG_PM_SLEEP is not.

This commit also deprecates the following macros:

SIMPLE_DEV_PM_OPS()
UNIVERSAL_DEV_PM_OPS()

And introduces the following macros:

DEFINE_SIMPLE_DEV_PM_OPS()
DEFINE_UNIVERSAL_DEV_PM_OPS()

These macros are similar to the functions they were created to replace,
with the following differences:

 - They use the new macros introduced above, and as such always
   reference the provided callback functions.

 - They are not tagged with __maybe_unused. They are meant to be used
   with pm_ptr() or pm_sleep_ptr() for DEFINE_UNIVERSAL_DEV_PM_OPS()
   and DEFINE_SIMPLE_DEV_PM_OPS() respectively.

 - They declare the symbol static, since every driver seems to do that
   anyway; and if a non-static use-case is needed an indirection pointer
   could be used.

The point of this change, is to progressively switch from a code model
where PM callbacks are all protected behind CONFIG_PM guards, to a code
model where the PM callbacks are always seen by the compiler, but
discarded if not used.

Signed-off-by: Paul Cercueil <paul@crapouillou.net>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/pm.h | 74 ++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 50 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pm.h b/include/linux/pm.h
index b88ac7dcf2a2..fc9691cb01b4 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -300,47 +300,59 @@ struct dev_pm_ops {
 	int (*runtime_idle)(struct device *dev);
 };
 
+#define SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \
+	.suspend = pm_sleep_ptr(suspend_fn), \
+	.resume = pm_sleep_ptr(resume_fn), \
+	.freeze = pm_sleep_ptr(suspend_fn), \
+	.thaw = pm_sleep_ptr(resume_fn), \
+	.poweroff = pm_sleep_ptr(suspend_fn), \
+	.restore = pm_sleep_ptr(resume_fn),
+
+#define LATE_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \
+	.suspend_late = pm_sleep_ptr(suspend_fn), \
+	.resume_early = pm_sleep_ptr(resume_fn), \
+	.freeze_late = pm_sleep_ptr(suspend_fn), \
+	.thaw_early = pm_sleep_ptr(resume_fn), \
+	.poweroff_late = pm_sleep_ptr(suspend_fn), \
+	.restore_early = pm_sleep_ptr(resume_fn),
+
+#define NOIRQ_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \
+	.suspend_noirq = pm_sleep_ptr(suspend_fn), \
+	.resume_noirq = pm_sleep_ptr(resume_fn), \
+	.freeze_noirq = pm_sleep_ptr(suspend_fn), \
+	.thaw_noirq = pm_sleep_ptr(resume_fn), \
+	.poweroff_noirq = pm_sleep_ptr(suspend_fn), \
+	.restore_noirq = pm_sleep_ptr(resume_fn),
+
+#define RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \
+	.runtime_suspend = suspend_fn, \
+	.runtime_resume = resume_fn, \
+	.runtime_idle = idle_fn,
+
 #ifdef CONFIG_PM_SLEEP
 #define SET_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \
-	.suspend = suspend_fn, \
-	.resume = resume_fn, \
-	.freeze = suspend_fn, \
-	.thaw = resume_fn, \
-	.poweroff = suspend_fn, \
-	.restore = resume_fn,
+	SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn)
 #else
 #define SET_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn)
 #endif
 
 #ifdef CONFIG_PM_SLEEP
 #define SET_LATE_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \
-	.suspend_late = suspend_fn, \
-	.resume_early = resume_fn, \
-	.freeze_late = suspend_fn, \
-	.thaw_early = resume_fn, \
-	.poweroff_late = suspend_fn, \
-	.restore_early = resume_fn,
+	LATE_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn)
 #else
 #define SET_LATE_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn)
 #endif
 
 #ifdef CONFIG_PM_SLEEP
 #define SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \
-	.suspend_noirq = suspend_fn, \
-	.resume_noirq = resume_fn, \
-	.freeze_noirq = suspend_fn, \
-	.thaw_noirq = resume_fn, \
-	.poweroff_noirq = suspend_fn, \
-	.restore_noirq = resume_fn,
+	NOIRQ_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn)
 #else
 #define SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn)
 #endif
 
 #ifdef CONFIG_PM
 #define SET_RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \
-	.runtime_suspend = suspend_fn, \
-	.runtime_resume = resume_fn, \
-	.runtime_idle = idle_fn,
+	RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn)
 #else
 #define SET_RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn)
 #endif
@@ -349,9 +361,9 @@ struct dev_pm_ops {
  * Use this if you want to use the same suspend and resume callbacks for suspend
  * to RAM and hibernation.
  */
-#define SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \
-const struct dev_pm_ops __maybe_unused name = { \
-	SET_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \
+#define DEFINE_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \
+static const struct dev_pm_ops name = { \
+	SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \
 }
 
 /*
@@ -367,6 +379,19 @@ const struct dev_pm_ops __maybe_unused name = { \
  * .resume_early(), to the same routines as .runtime_suspend() and
  * .runtime_resume(), respectively (and analogously for hibernation).
  */
+#define DEFINE_UNIVERSAL_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \
+static const struct dev_pm_ops name = { \
+	SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \
+	RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \
+}
+
+/* Deprecated. Use DEFINE_SIMPLE_DEV_PM_OPS() instead. */
+#define SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \
+const struct dev_pm_ops __maybe_unused name = { \
+	SET_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \
+}
+
+/* Deprecated. Use DEFINE_UNIVERSAL_DEV_PM_OPS() instead. */
 #define UNIVERSAL_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \
 const struct dev_pm_ops __maybe_unused name = { \
 	SET_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \
@@ -374,6 +399,7 @@ const struct dev_pm_ops __maybe_unused name = { \
 }
 
 #define pm_ptr(_ptr) PTR_IF(IS_ENABLED(CONFIG_PM), (_ptr))
+#define pm_sleep_ptr(_ptr) PTR_IF(IS_ENABLED(CONFIG_PM_SLEEP), (_ptr))
 
 /*
  * PM_EVENT_ messages
-- 
cgit v1.2.3


From 931da6a0de5d620425af4425344259e6ff46b654 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Tue, 7 Dec 2021 21:17:34 +0800
Subject: powercap: intel_rapl: support new layout of Psys PowerLimit Register
 on SPR

On Sapphire Rapids, the layout of the Psys domain Power Limit Register
is different from from what it was before.

Enhance the code to support the new Psys PL register layout.

Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Reported-and-tested-by: Alkattan Dana <dana.alkattan@intel.com>
[ rjw: Subject and changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/intel_rapl.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h
index 93780834fc8f..9f4b6f5b822f 100644
--- a/include/linux/intel_rapl.h
+++ b/include/linux/intel_rapl.h
@@ -58,6 +58,12 @@ enum rapl_primitives {
 	THROTTLED_TIME,
 	PRIORITY_LEVEL,
 
+	PSYS_POWER_LIMIT1,
+	PSYS_POWER_LIMIT2,
+	PSYS_PL1_ENABLE,
+	PSYS_PL2_ENABLE,
+	PSYS_TIME_WINDOW1,
+	PSYS_TIME_WINDOW2,
 	/* below are not raw primitive data */
 	AVERAGE_POWER,
 	NR_RAPL_PRIMITIVES,
-- 
cgit v1.2.3


From c24efa6732788f0be22cdf5d2aedd5e3117e983f Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 7 Dec 2021 19:54:32 +0100
Subject: PM: runtime: Capture device status before disabling runtime PM

In some cases (for example, during system-wide suspend and resume of
devices) it is useful to know whether or not runtime PM has ever been
enabled for a given device and, if so, what the runtime PM status of
it had been right before runtime PM was disabled for it last time.

For this reason, introduce a new struct dev_pm_info field called
last_status that will be used for capturing the runtime PM status of
the device when its power.disable_depth counter changes from 0 to 1.

The new field will be set to RPM_INVALID to start with and whenever
power.disable_depth changes from 1 to 0, so it will be valid only
when runtime PM of the device is currently disabled, but it has been
enabled at least once.

Immediately use power.last_status in rpm_resume() to make it handle
the case when PM runtime is disabled for the device, but its runtime
PM status is RPM_ACTIVE more consistently.  Namely, make it return 1
if power.last_status is also equal to RPM_ACTIVE in that case (the
idea being that if the status was RPM_ACTIVE last time when
power.disable_depth was changing from 0 to 1 and it is still
RPM_ACTIVE, it can be assumed to reflect what happened to the device
last time when it was using runtime PM) and -EACCES otherwise.

Update the documentation to provide a description of last_status and
change the description of pm_runtime_resume() in it to reflect the
new behavior of rpm_active().

While at it, rearrange the code in pm_runtime_enable() to be more
straightforward and replace the WARN() macro in it with a pr_warn()
invocation which is less disruptive.

Link: https://lore.kernel.org/linux-pm/20211026222626.39222-1-ulf.hansson@linaro.org/t/#u
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/pm.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pm.h b/include/linux/pm.h
index fc9691cb01b4..e1e9402180b9 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -521,6 +521,7 @@ const struct dev_pm_ops __maybe_unused name = { \
  */
 
 enum rpm_status {
+	RPM_INVALID = -1,
 	RPM_ACTIVE = 0,
 	RPM_RESUMING,
 	RPM_SUSPENDED,
@@ -634,6 +635,7 @@ struct dev_pm_info {
 	unsigned int		links_count;
 	enum rpm_request	request;
 	enum rpm_status		runtime_status;
+	enum rpm_status		last_status;
 	int			runtime_error;
 	int			autosuspend_delay;
 	u64			last_busy;
-- 
cgit v1.2.3


From d1579e61192e0e686faa4208500ef4c3b529b16c Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Fri, 10 Dec 2021 17:10:13 +0100
Subject: PM: runtime: Add safety net to supplier device release

Because refcount_dec_not_one() returns true if the target refcount
becomes saturated, it is generally unsafe to use its return value as
a loop termination condition, but that is what happens when a device
link's supplier device is released during runtime PM suspend
operations and on device link removal.

To address this, introduce pm_runtime_release_supplier() to be used
in the above cases which will check the supplier device's runtime
PM usage counter in addition to the refcount_dec_not_one() return
value, so the loop can be terminated in case the rpm_active refcount
value becomes invalid, and update the code in question to use it as
appropriate.

This change is not expected to have any visible functional impact.

Reported-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/pm_runtime.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index eddd66d426ca..016de5776b6d 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -58,6 +58,7 @@ extern void pm_runtime_get_suppliers(struct device *dev);
 extern void pm_runtime_put_suppliers(struct device *dev);
 extern void pm_runtime_new_link(struct device *dev);
 extern void pm_runtime_drop_link(struct device_link *link);
+extern void pm_runtime_release_supplier(struct device_link *link, bool check_idle);
 
 extern int devm_pm_runtime_enable(struct device *dev);
 
@@ -283,6 +284,8 @@ static inline void pm_runtime_get_suppliers(struct device *dev) {}
 static inline void pm_runtime_put_suppliers(struct device *dev) {}
 static inline void pm_runtime_new_link(struct device *dev) {}
 static inline void pm_runtime_drop_link(struct device_link *link) {}
+static inline void pm_runtime_release_supplier(struct device_link *link,
+					       bool check_idle) {}
 
 #endif /* !CONFIG_PM */
 
-- 
cgit v1.2.3


From 227fee5fc99eeb74d43bf68832f6d59d30ac07d8 Mon Sep 17 00:00:00 2001
From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Date: Thu, 16 Dec 2021 13:42:25 +0530
Subject: bus: mhi: core: Add an API for auto queueing buffers for DL channel

Add a new API "mhi_prepare_for_transfer_autoqueue" for using with client
drivers like QRTR to request MHI core to autoqueue buffers for the DL
channel along with starting both UL and DL channels.

So far, the "auto_queue" flag specified by the controller drivers in
channel definition served this purpose but this will be removed at some
point in future.

Cc: netdev@vger.kernel.org
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: David S. Miller <davem@davemloft.net>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Co-developed-by: Loic Poulain <loic.poulain@linaro.org>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Loic Poulain <loic.poulain@linaro.org>
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Link: https://lore.kernel.org/r/20211216081227.237749-9-manivannan.sadhasivam@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/mhi.h | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mhi.h b/include/linux/mhi.h
index a5cc4cdf9cc8..a5441ad33c74 100644
--- a/include/linux/mhi.h
+++ b/include/linux/mhi.h
@@ -730,15 +730,26 @@ void mhi_device_put(struct mhi_device *mhi_dev);
 
 /**
  * mhi_prepare_for_transfer - Setup UL and DL channels for data transfer.
- *                            Allocate and initialize the channel context and
- *                            also issue the START channel command to both
- *                            channels. Channels can be started only if both
- *                            host and device execution environments match and
- *                            channels are in a DISABLED state.
  * @mhi_dev: Device associated with the channels
+ *
+ * Allocate and initialize the channel context and also issue the START channel
+ * command to both channels. Channels can be started only if both host and
+ * device execution environments match and channels are in a DISABLED state.
  */
 int mhi_prepare_for_transfer(struct mhi_device *mhi_dev);
 
+/**
+ * mhi_prepare_for_transfer_autoqueue - Setup UL and DL channels with auto queue
+ *                                      buffers for DL traffic
+ * @mhi_dev: Device associated with the channels
+ *
+ * Allocate and initialize the channel context and also issue the START channel
+ * command to both channels. Channels can be started only if both host and
+ * device execution environments match and channels are in a DISABLED state.
+ * The MHI core will automatically allocate and queue buffers for the DL traffic.
+ */
+int mhi_prepare_for_transfer_autoqueue(struct mhi_device *mhi_dev);
+
 /**
  * mhi_unprepare_from_transfer - Reset UL and DL channels for data transfer.
  *                               Issue the RESET channel command and let the
-- 
cgit v1.2.3


From 49f39cb0ef198ae3c73765c9b9ee3034e4c9f076 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Wed, 1 Dec 2021 14:59:30 +0200
Subject: device property: Fix documentation for FWNODE_GRAPH_DEVICE_DISABLED

FWNODE_GRAPH_DEVICE_DISABLED flag was meant for also returning endpoints
connected to disabled devices, but it also may return endpoints that are
not connected. Fix this in documentation. Also
fwnode_graph_get_endpoint_by_id() was affeced by this.

Also improve the language a little bit.

Fixes: 0fcc2bdc8aff ("device property: Add fwnode_graph_get_endpoint_by_id()")
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/property.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/property.h b/include/linux/property.h
index 16f736c698a2..7a2df45ec3ae 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -414,7 +414,8 @@ static inline bool fwnode_graph_is_endpoint(struct fwnode_handle *fwnode)
  *				one.
  * @FWNODE_GRAPH_DEVICE_DISABLED: That the device to which the remote
  *				  endpoint of the given endpoint belongs to,
- *				  may be disabled.
+ *				  may be disabled, or that the endpoint is not
+ *				  connected.
  */
 #define FWNODE_GRAPH_ENDPOINT_NEXT	BIT(0)
 #define FWNODE_GRAPH_DEVICE_DISABLED	BIT(1)
-- 
cgit v1.2.3


From c87b8fc569667610b4891cad1e4a663e5a94d8f8 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Wed, 1 Dec 2021 14:59:33 +0200
Subject: device property: Implement fwnode_graph_get_endpoint_count()

Add fwnode_graph_get_endpoint_count() function to provide generic
implementation of of_graph_get_endpoint_count(). The former by default
only counts endpoints to available devices which is consistent with the
rest of the fwnode graph API. By providing FWNODE_GRAPH_DEVICE_DISABLED
flag, also unconnected endpoints and endpoints to disabled devices are
counted.

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/property.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/property.h b/include/linux/property.h
index 7a2df45ec3ae..8c0104871252 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -423,6 +423,8 @@ static inline bool fwnode_graph_is_endpoint(struct fwnode_handle *fwnode)
 struct fwnode_handle *
 fwnode_graph_get_endpoint_by_id(const struct fwnode_handle *fwnode,
 				u32 port, u32 endpoint, unsigned long flags);
+unsigned int fwnode_graph_get_endpoint_count(struct fwnode_handle *fwnode,
+					     unsigned long flags);
 
 #define fwnode_graph_for_each_endpoint(fwnode, child)			\
 	for (child = NULL;						\
-- 
cgit v1.2.3


From c49eea6ffec626c059ace085fce1bf501b05dbc7 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Wed, 1 Dec 2021 15:01:15 +0200
Subject: device property: Drop fwnode_graph_get_remote_node()

fwnode_graph_get_remote_node() is only used by the tegra-video driver.
Convert it to use newer fwnode_graph_get_endpoint_by_id() and drop
now-unused fwnode_graph_get_remote_node().

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/property.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/property.h b/include/linux/property.h
index 8c0104871252..8355f99ebd47 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -397,9 +397,6 @@ struct fwnode_handle *fwnode_graph_get_remote_port(
 	const struct fwnode_handle *fwnode);
 struct fwnode_handle *fwnode_graph_get_remote_endpoint(
 	const struct fwnode_handle *fwnode);
-struct fwnode_handle *
-fwnode_graph_get_remote_node(const struct fwnode_handle *fwnode, u32 port,
-			     u32 endpoint);
 
 static inline bool fwnode_graph_is_endpoint(struct fwnode_handle *fwnode)
 {
-- 
cgit v1.2.3


From bd0b536dc2e1e9828a85b1e3470ee7bafc3b36f6 Mon Sep 17 00:00:00 2001
From: Brett Creeley <brett.creeley@intel.com>
Date: Mon, 29 Nov 2021 16:15:59 -0800
Subject: virtchnl: Add support for new VLAN capabilities

Currently VIRTCHNL only allows for VLAN filtering and offloads to happen
on a single 802.1Q VLAN. Add support to filter and offload on inner,
outer, and/or inner + outer VLANs.

This is done by introducing the new capability
VIRTCHNL_VF_OFFLOAD_VLAN_V2. The flow to negotiate this new capability
is shown below.

1. VF - sets the VIRTCHNL_VF_OFFLOAD_VLAN_V2 bit in the
   virtchnl_vf_resource.vf_caps_flags during the
   VIRTCHNL_OP_GET_VF_RESOURCES request message. The VF should also set
   the VIRTCHNL_VF_OFFLOAD_VLAN bit in case the PF driver doesn't support
   the new capability.

2. PF - sets the VLAN capability bit it supports in the
   VIRTCHNL_OP_GET_VF_RESOURCES response message. This will either be
   VIRTCHNL_VF_OFFLOAD_VLAN_V2, VIRTCHNL_VF_OFFLOAD_VLAN, or none.

3. VF - If the VIRTCHNL_VF_OFFLOAD_VLAN_V2 capability was ACK'd by the
   PF, then the VF needs to request the VLAN capabilities of the
   PF/Device by issuing a VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS request.
   If the VIRTCHNL_VF_OFFLOAD_VLAN capability was ACK'd then the VF
   knows only single 802.1Q VLAN filtering/offloads are supported. If no
   VLAN capability is ACK'd then the PF/Device doesn't support hardware
   VLAN filtering/offloads for this VF.

4. PF - Populates the virtchnl_vlan_caps structure based on what it
   allows/supports for that VF and sends that response via
   VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS.

After VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS is successfully negotiated
the VF driver needs to interpret the capabilities supported by the
underlying PF/Device. The VF will be allowed to filter/offload the
inner 802.1Q, outer (various ethertype), inner 802.1Q + outer
(various ethertypes), or none based on which fields are set.

The VF will also need to interpret where the VLAN tag should be inserted
and/or stripped based on the negotiated capabilities.

Signed-off-by: Brett Creeley <brett.creeley@intel.com>
Tested-by: Konrad Jankowski <konrad0.jankowski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 include/linux/avf/virtchnl.h | 377 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 377 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index b30a1bc74fc7..2ce27e8e4f19 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -141,6 +141,13 @@ enum virtchnl_ops {
 	VIRTCHNL_OP_DEL_RSS_CFG = 46,
 	VIRTCHNL_OP_ADD_FDIR_FILTER = 47,
 	VIRTCHNL_OP_DEL_FDIR_FILTER = 48,
+	VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS = 51,
+	VIRTCHNL_OP_ADD_VLAN_V2 = 52,
+	VIRTCHNL_OP_DEL_VLAN_V2 = 53,
+	VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 = 54,
+	VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2 = 55,
+	VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2 = 56,
+	VIRTCHNL_OP_DISABLE_VLAN_INSERTION_V2 = 57,
 	VIRTCHNL_OP_MAX,
 };
 
@@ -246,6 +253,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource);
 #define VIRTCHNL_VF_OFFLOAD_REQ_QUEUES		BIT(6)
 /* used to negotiate communicating link speeds in Mbps */
 #define VIRTCHNL_VF_CAP_ADV_LINK_SPEED		BIT(7)
+#define VIRTCHNL_VF_OFFLOAD_VLAN_V2		BIT(15)
 #define VIRTCHNL_VF_OFFLOAD_VLAN		BIT(16)
 #define VIRTCHNL_VF_OFFLOAD_RX_POLLING		BIT(17)
 #define VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2	BIT(18)
@@ -475,6 +483,351 @@ struct virtchnl_vlan_filter_list {
 
 VIRTCHNL_CHECK_STRUCT_LEN(6, virtchnl_vlan_filter_list);
 
+/* This enum is used for all of the VIRTCHNL_VF_OFFLOAD_VLAN_V2_CAPS related
+ * structures and opcodes.
+ *
+ * VIRTCHNL_VLAN_UNSUPPORTED - This field is not supported and if a VF driver
+ * populates it the PF should return VIRTCHNL_STATUS_ERR_NOT_SUPPORTED.
+ *
+ * VIRTCHNL_VLAN_ETHERTYPE_8100 - This field supports 0x8100 ethertype.
+ * VIRTCHNL_VLAN_ETHERTYPE_88A8 - This field supports 0x88A8 ethertype.
+ * VIRTCHNL_VLAN_ETHERTYPE_9100 - This field supports 0x9100 ethertype.
+ *
+ * VIRTCHNL_VLAN_ETHERTYPE_AND - Used when multiple ethertypes can be supported
+ * by the PF concurrently. For example, if the PF can support
+ * VIRTCHNL_VLAN_ETHERTYPE_8100 AND VIRTCHNL_VLAN_ETHERTYPE_88A8 filters it
+ * would OR the following bits:
+ *
+ *	VIRTHCNL_VLAN_ETHERTYPE_8100 |
+ *	VIRTCHNL_VLAN_ETHERTYPE_88A8 |
+ *	VIRTCHNL_VLAN_ETHERTYPE_AND;
+ *
+ * The VF would interpret this as VLAN filtering can be supported on both 0x8100
+ * and 0x88A8 VLAN ethertypes.
+ *
+ * VIRTCHNL_ETHERTYPE_XOR - Used when only a single ethertype can be supported
+ * by the PF concurrently. For example if the PF can support
+ * VIRTCHNL_VLAN_ETHERTYPE_8100 XOR VIRTCHNL_VLAN_ETHERTYPE_88A8 stripping
+ * offload it would OR the following bits:
+ *
+ *	VIRTCHNL_VLAN_ETHERTYPE_8100 |
+ *	VIRTCHNL_VLAN_ETHERTYPE_88A8 |
+ *	VIRTCHNL_VLAN_ETHERTYPE_XOR;
+ *
+ * The VF would interpret this as VLAN stripping can be supported on either
+ * 0x8100 or 0x88a8 VLAN ethertypes. So when requesting VLAN stripping via
+ * VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 the specified ethertype will override
+ * the previously set value.
+ *
+ * VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1 - Used to tell the VF to insert and/or
+ * strip the VLAN tag using the L2TAG1 field of the Tx/Rx descriptors.
+ *
+ * VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2 - Used to tell the VF to insert hardware
+ * offloaded VLAN tags using the L2TAG2 field of the Tx descriptor.
+ *
+ * VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2 - Used to tell the VF to strip hardware
+ * offloaded VLAN tags using the L2TAG2_2 field of the Rx descriptor.
+ *
+ * VIRTCHNL_VLAN_PRIO - This field supports VLAN priority bits. This is used for
+ * VLAN filtering if the underlying PF supports it.
+ *
+ * VIRTCHNL_VLAN_TOGGLE_ALLOWED - This field is used to say whether a
+ * certain VLAN capability can be toggled. For example if the underlying PF/CP
+ * allows the VF to toggle VLAN filtering, stripping, and/or insertion it should
+ * set this bit along with the supported ethertypes.
+ */
+enum virtchnl_vlan_support {
+	VIRTCHNL_VLAN_UNSUPPORTED =		0,
+	VIRTCHNL_VLAN_ETHERTYPE_8100 =		BIT(0),
+	VIRTCHNL_VLAN_ETHERTYPE_88A8 =		BIT(1),
+	VIRTCHNL_VLAN_ETHERTYPE_9100 =		BIT(2),
+	VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1 =	BIT(8),
+	VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2 =	BIT(9),
+	VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2_2 =	BIT(10),
+	VIRTCHNL_VLAN_PRIO =			BIT(24),
+	VIRTCHNL_VLAN_FILTER_MASK =		BIT(28),
+	VIRTCHNL_VLAN_ETHERTYPE_AND =		BIT(29),
+	VIRTCHNL_VLAN_ETHERTYPE_XOR =		BIT(30),
+	VIRTCHNL_VLAN_TOGGLE =			BIT(31),
+};
+
+/* This structure is used as part of the VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS
+ * for filtering, insertion, and stripping capabilities.
+ *
+ * If only outer capabilities are supported (for filtering, insertion, and/or
+ * stripping) then this refers to the outer most or single VLAN from the VF's
+ * perspective.
+ *
+ * If only inner capabilities are supported (for filtering, insertion, and/or
+ * stripping) then this refers to the outer most or single VLAN from the VF's
+ * perspective. Functionally this is the same as if only outer capabilities are
+ * supported. The VF driver is just forced to use the inner fields when
+ * adding/deleting filters and enabling/disabling offloads (if supported).
+ *
+ * If both outer and inner capabilities are supported (for filtering, insertion,
+ * and/or stripping) then outer refers to the outer most or single VLAN and
+ * inner refers to the second VLAN, if it exists, in the packet.
+ *
+ * There is no support for tunneled VLAN offloads, so outer or inner are never
+ * referring to a tunneled packet from the VF's perspective.
+ */
+struct virtchnl_vlan_supported_caps {
+	u32 outer;
+	u32 inner;
+};
+
+/* The PF populates these fields based on the supported VLAN filtering. If a
+ * field is VIRTCHNL_VLAN_UNSUPPORTED then it's not supported and the PF will
+ * reject any VIRTCHNL_OP_ADD_VLAN_V2 or VIRTCHNL_OP_DEL_VLAN_V2 messages using
+ * the unsupported fields.
+ *
+ * Also, a VF is only allowed to toggle its VLAN filtering setting if the
+ * VIRTCHNL_VLAN_TOGGLE bit is set.
+ *
+ * The ethertype(s) specified in the ethertype_init field are the ethertypes
+ * enabled for VLAN filtering. VLAN filtering in this case refers to the outer
+ * most VLAN from the VF's perspective. If both inner and outer filtering are
+ * allowed then ethertype_init only refers to the outer most VLAN as only
+ * VLAN ethertype supported for inner VLAN filtering is
+ * VIRTCHNL_VLAN_ETHERTYPE_8100. By default, inner VLAN filtering is disabled
+ * when both inner and outer filtering are allowed.
+ *
+ * The max_filters field tells the VF how many VLAN filters it's allowed to have
+ * at any one time. If it exceeds this amount and tries to add another filter,
+ * then the request will be rejected by the PF. To prevent failures, the VF
+ * should keep track of how many VLAN filters it has added and not attempt to
+ * add more than max_filters.
+ */
+struct virtchnl_vlan_filtering_caps {
+	struct virtchnl_vlan_supported_caps filtering_support;
+	u32 ethertype_init;
+	u16 max_filters;
+	u8 pad[2];
+};
+
+VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vlan_filtering_caps);
+
+/* This enum is used for the virtchnl_vlan_offload_caps structure to specify
+ * if the PF supports a different ethertype for stripping and insertion.
+ *
+ * VIRTCHNL_ETHERTYPE_STRIPPING_MATCHES_INSERTION - The ethertype(s) specified
+ * for stripping affect the ethertype(s) specified for insertion and visa versa
+ * as well. If the VF tries to configure VLAN stripping via
+ * VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 with VIRTCHNL_VLAN_ETHERTYPE_8100 then
+ * that will be the ethertype for both stripping and insertion.
+ *
+ * VIRTCHNL_ETHERTYPE_MATCH_NOT_REQUIRED - The ethertype(s) specified for
+ * stripping do not affect the ethertype(s) specified for insertion and visa
+ * versa.
+ */
+enum virtchnl_vlan_ethertype_match {
+	VIRTCHNL_ETHERTYPE_STRIPPING_MATCHES_INSERTION = 0,
+	VIRTCHNL_ETHERTYPE_MATCH_NOT_REQUIRED = 1,
+};
+
+/* The PF populates these fields based on the supported VLAN offloads. If a
+ * field is VIRTCHNL_VLAN_UNSUPPORTED then it's not supported and the PF will
+ * reject any VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 or
+ * VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2 messages using the unsupported fields.
+ *
+ * Also, a VF is only allowed to toggle its VLAN offload setting if the
+ * VIRTCHNL_VLAN_TOGGLE_ALLOWED bit is set.
+ *
+ * The VF driver needs to be aware of how the tags are stripped by hardware and
+ * inserted by the VF driver based on the level of offload support. The PF will
+ * populate these fields based on where the VLAN tags are expected to be
+ * offloaded via the VIRTHCNL_VLAN_TAG_LOCATION_* bits. The VF will need to
+ * interpret these fields. See the definition of the
+ * VIRTCHNL_VLAN_TAG_LOCATION_* bits above the virtchnl_vlan_support
+ * enumeration.
+ */
+struct virtchnl_vlan_offload_caps {
+	struct virtchnl_vlan_supported_caps stripping_support;
+	struct virtchnl_vlan_supported_caps insertion_support;
+	u32 ethertype_init;
+	u8 ethertype_match;
+	u8 pad[3];
+};
+
+VIRTCHNL_CHECK_STRUCT_LEN(24, virtchnl_vlan_offload_caps);
+
+/* VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS
+ * VF sends this message to determine its VLAN capabilities.
+ *
+ * PF will mark which capabilities it supports based on hardware support and
+ * current configuration. For example, if a port VLAN is configured the PF will
+ * not allow outer VLAN filtering, stripping, or insertion to be configured so
+ * it will block these features from the VF.
+ *
+ * The VF will need to cross reference its capabilities with the PFs
+ * capabilities in the response message from the PF to determine the VLAN
+ * support.
+ */
+struct virtchnl_vlan_caps {
+	struct virtchnl_vlan_filtering_caps filtering;
+	struct virtchnl_vlan_offload_caps offloads;
+};
+
+VIRTCHNL_CHECK_STRUCT_LEN(40, virtchnl_vlan_caps);
+
+struct virtchnl_vlan {
+	u16 tci;	/* tci[15:13] = PCP and tci[11:0] = VID */
+	u16 tci_mask;	/* only valid if VIRTCHNL_VLAN_FILTER_MASK set in
+			 * filtering caps
+			 */
+	u16 tpid;	/* 0x8100, 0x88a8, etc. and only type(s) set in
+			 * filtering caps. Note that tpid here does not refer to
+			 * VIRTCHNL_VLAN_ETHERTYPE_*, but it refers to the
+			 * actual 2-byte VLAN TPID
+			 */
+	u8 pad[2];
+};
+
+VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_vlan);
+
+struct virtchnl_vlan_filter {
+	struct virtchnl_vlan inner;
+	struct virtchnl_vlan outer;
+	u8 pad[16];
+};
+
+VIRTCHNL_CHECK_STRUCT_LEN(32, virtchnl_vlan_filter);
+
+/* VIRTCHNL_OP_ADD_VLAN_V2
+ * VIRTCHNL_OP_DEL_VLAN_V2
+ *
+ * VF sends these messages to add/del one or more VLAN tag filters for Rx
+ * traffic.
+ *
+ * The PF attempts to add the filters and returns status.
+ *
+ * The VF should only ever attempt to add/del virtchnl_vlan_filter(s) using the
+ * supported fields negotiated via VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS.
+ */
+struct virtchnl_vlan_filter_list_v2 {
+	u16 vport_id;
+	u16 num_elements;
+	u8 pad[4];
+	struct virtchnl_vlan_filter filters[1];
+};
+
+VIRTCHNL_CHECK_STRUCT_LEN(40, virtchnl_vlan_filter_list_v2);
+
+/* VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2
+ * VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2
+ * VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2
+ * VIRTCHNL_OP_DISABLE_VLAN_INSERTION_V2
+ *
+ * VF sends this message to enable or disable VLAN stripping or insertion. It
+ * also needs to specify an ethertype. The VF knows which VLAN ethertypes are
+ * allowed and whether or not it's allowed to enable/disable the specific
+ * offload via the VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS message. The VF needs to
+ * parse the virtchnl_vlan_caps.offloads fields to determine which offload
+ * messages are allowed.
+ *
+ * For example, if the PF populates the virtchnl_vlan_caps.offloads in the
+ * following manner the VF will be allowed to enable and/or disable 0x8100 inner
+ * VLAN insertion and/or stripping via the opcodes listed above. Inner in this
+ * case means the outer most or single VLAN from the VF's perspective. This is
+ * because no outer offloads are supported. See the comments above the
+ * virtchnl_vlan_supported_caps structure for more details.
+ *
+ * virtchnl_vlan_caps.offloads.stripping_support.inner =
+ *			VIRTCHNL_VLAN_TOGGLE |
+ *			VIRTCHNL_VLAN_ETHERTYPE_8100;
+ *
+ * virtchnl_vlan_caps.offloads.insertion_support.inner =
+ *			VIRTCHNL_VLAN_TOGGLE |
+ *			VIRTCHNL_VLAN_ETHERTYPE_8100;
+ *
+ * In order to enable inner (again note that in this case inner is the outer
+ * most or single VLAN from the VF's perspective) VLAN stripping for 0x8100
+ * VLANs, the VF would populate the virtchnl_vlan_setting structure in the
+ * following manner and send the VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 message.
+ *
+ * virtchnl_vlan_setting.inner_ethertype_setting =
+ *			VIRTCHNL_VLAN_ETHERTYPE_8100;
+ *
+ * virtchnl_vlan_setting.vport_id = vport_id or vsi_id assigned to the VF on
+ * initialization.
+ *
+ * The reason that VLAN TPID(s) are not being used for the
+ * outer_ethertype_setting and inner_ethertype_setting fields is because it's
+ * possible a device could support VLAN insertion and/or stripping offload on
+ * multiple ethertypes concurrently, so this method allows a VF to request
+ * multiple ethertypes in one message using the virtchnl_vlan_support
+ * enumeration.
+ *
+ * For example, if the PF populates the virtchnl_vlan_caps.offloads in the
+ * following manner the VF will be allowed to enable 0x8100 and 0x88a8 outer
+ * VLAN insertion and stripping simultaneously. The
+ * virtchnl_vlan_caps.offloads.ethertype_match field will also have to be
+ * populated based on what the PF can support.
+ *
+ * virtchnl_vlan_caps.offloads.stripping_support.outer =
+ *			VIRTCHNL_VLAN_TOGGLE |
+ *			VIRTCHNL_VLAN_ETHERTYPE_8100 |
+ *			VIRTCHNL_VLAN_ETHERTYPE_88A8 |
+ *			VIRTCHNL_VLAN_ETHERTYPE_AND;
+ *
+ * virtchnl_vlan_caps.offloads.insertion_support.outer =
+ *			VIRTCHNL_VLAN_TOGGLE |
+ *			VIRTCHNL_VLAN_ETHERTYPE_8100 |
+ *			VIRTCHNL_VLAN_ETHERTYPE_88A8 |
+ *			VIRTCHNL_VLAN_ETHERTYPE_AND;
+ *
+ * In order to enable outer VLAN stripping for 0x8100 and 0x88a8 VLANs, the VF
+ * would populate the virthcnl_vlan_offload_structure in the following manner
+ * and send the VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 message.
+ *
+ * virtchnl_vlan_setting.outer_ethertype_setting =
+ *			VIRTHCNL_VLAN_ETHERTYPE_8100 |
+ *			VIRTHCNL_VLAN_ETHERTYPE_88A8;
+ *
+ * virtchnl_vlan_setting.vport_id = vport_id or vsi_id assigned to the VF on
+ * initialization.
+ *
+ * There is also the case where a PF and the underlying hardware can support
+ * VLAN offloads on multiple ethertypes, but not concurrently. For example, if
+ * the PF populates the virtchnl_vlan_caps.offloads in the following manner the
+ * VF will be allowed to enable and/or disable 0x8100 XOR 0x88a8 outer VLAN
+ * offloads. The ethertypes must match for stripping and insertion.
+ *
+ * virtchnl_vlan_caps.offloads.stripping_support.outer =
+ *			VIRTCHNL_VLAN_TOGGLE |
+ *			VIRTCHNL_VLAN_ETHERTYPE_8100 |
+ *			VIRTCHNL_VLAN_ETHERTYPE_88A8 |
+ *			VIRTCHNL_VLAN_ETHERTYPE_XOR;
+ *
+ * virtchnl_vlan_caps.offloads.insertion_support.outer =
+ *			VIRTCHNL_VLAN_TOGGLE |
+ *			VIRTCHNL_VLAN_ETHERTYPE_8100 |
+ *			VIRTCHNL_VLAN_ETHERTYPE_88A8 |
+ *			VIRTCHNL_VLAN_ETHERTYPE_XOR;
+ *
+ * virtchnl_vlan_caps.offloads.ethertype_match =
+ *			VIRTCHNL_ETHERTYPE_STRIPPING_MATCHES_INSERTION;
+ *
+ * In order to enable outer VLAN stripping for 0x88a8 VLANs, the VF would
+ * populate the virtchnl_vlan_setting structure in the following manner and send
+ * the VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2. Also, this will change the
+ * ethertype for VLAN insertion if it's enabled. So, for completeness, a
+ * VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2 with the same ethertype should be sent.
+ *
+ * virtchnl_vlan_setting.outer_ethertype_setting = VIRTHCNL_VLAN_ETHERTYPE_88A8;
+ *
+ * virtchnl_vlan_setting.vport_id = vport_id or vsi_id assigned to the VF on
+ * initialization.
+ */
+struct virtchnl_vlan_setting {
+	u32 outer_ethertype_setting;
+	u32 inner_ethertype_setting;
+	u16 vport_id;
+	u8 pad[6];
+};
+
+VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vlan_setting);
+
 /* VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE
  * VF sends VSI id and flags.
  * PF returns status code in retval.
@@ -1156,6 +1509,30 @@ virtchnl_vc_validate_vf_msg(struct virtchnl_version_info *ver, u32 v_opcode,
 	case VIRTCHNL_OP_DEL_FDIR_FILTER:
 		valid_len = sizeof(struct virtchnl_fdir_del);
 		break;
+	case VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS:
+		break;
+	case VIRTCHNL_OP_ADD_VLAN_V2:
+	case VIRTCHNL_OP_DEL_VLAN_V2:
+		valid_len = sizeof(struct virtchnl_vlan_filter_list_v2);
+		if (msglen >= valid_len) {
+			struct virtchnl_vlan_filter_list_v2 *vfl =
+			    (struct virtchnl_vlan_filter_list_v2 *)msg;
+
+			valid_len += (vfl->num_elements - 1) *
+				sizeof(struct virtchnl_vlan_filter);
+
+			if (vfl->num_elements == 0) {
+				err_msg_format = true;
+				break;
+			}
+		}
+		break;
+	case VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2:
+	case VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2:
+	case VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2:
+	case VIRTCHNL_OP_DISABLE_VLAN_INSERTION_V2:
+		valid_len = sizeof(struct virtchnl_vlan_setting);
+		break;
 	/* These are always errors coming from the VF. */
 	case VIRTCHNL_OP_EVENT:
 	case VIRTCHNL_OP_UNKNOWN:
-- 
cgit v1.2.3


From 877fee2a0c65a3b0b6ac0e90d7d7718b5a0341d3 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Fri, 17 Dec 2021 15:15:15 +0100
Subject: PCI: Convert pci_dev_present() stub to static inline

Change the pci_dev_present() stub which is used when CONFIG_PCI is not set
from a #define to a static inline stub.

Thix should fix clang -Werror builds failing due to errors like this:

  drivers/platform/x86/thinkpad_acpi.c:4475:35:
   error: unused variable 'fwbug_cards_ids' [-Werror,-Wunused-const-variable]

Where fwbug_cards_ids is an array of pci_device_id passed to
pci_dev_present() during a quirk check.

Link: https://lore.kernel.org/r/20211217141515.379586-1-hdegoede@redhat.com
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Cc: platform-driver-x86@vger.kernel.org
---
 include/linux/pci.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 18a75c8e615c..7d825637d7ca 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1775,7 +1775,10 @@ static inline struct pci_dev *pci_get_class(unsigned int class,
 					    struct pci_dev *from)
 { return NULL; }
 
-#define pci_dev_present(ids)	(0)
+
+static inline int pci_dev_present(const struct pci_device_id *ids)
+{ return 0; }
+
 #define no_pci_devices()	(1)
 #define pci_dev_put(dev)	do { } while (0)
 
-- 
cgit v1.2.3


From 3849595866166b23bf6a0cb9ff87e06423167f67 Mon Sep 17 00:00:00 2001
From: Paul Blakey <paulb@nvidia.com>
Date: Tue, 14 Dec 2021 19:24:34 +0200
Subject: net/sched: flow_dissector: Fix matching on zone id for invalid conns

If ct rejects a flow, it removes the conntrack info from the skb.
act_ct sets the post_ct variable so the dissector will see this case
as an +tracked +invalid state, but the zone id is lost with the
conntrack info.

To restore the zone id on such cases, set the last executed zone,
via the tc control block, when passing ct, and read it back in the
dissector if there is no ct info on the skb (invalid connection).

Fixes: 7baf2429a1a9 ("net/sched: cls_flower add CT_FLAGS_INVALID flag support")
Signed-off-by: Paul Blakey <paulb@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c8cb7e697d47..2ecf8cfd2223 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1380,7 +1380,7 @@ skb_flow_dissect_ct(const struct sk_buff *skb,
 		    struct flow_dissector *flow_dissector,
 		    void *target_container,
 		    u16 *ctinfo_map, size_t mapsize,
-		    bool post_ct);
+		    bool post_ct, u16 zone);
 void
 skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
 			     struct flow_dissector *flow_dissector,
-- 
cgit v1.2.3


From 635d448a1cce4b4ebee52b351052c70434fa90ea Mon Sep 17 00:00:00 2001
From: Paul Blakey <paulb@nvidia.com>
Date: Tue, 14 Dec 2021 19:24:35 +0200
Subject: net: openvswitch: Fix matching zone id for invalid conns arriving
 from tc

Zone id is not restored if we passed ct and ct rejected the connection,
as there is no ct info on the skb.

Save the zone from tc skb cb to tc skb extension and pass it on to
ovs, use that info to restore the zone id for invalid connections.

Fixes: d29334c15d33 ("net/sched: act_api: fix miss set post_ct for ovs after do conntrack in act_ct")
Signed-off-by: Paul Blakey <paulb@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 2ecf8cfd2223..4507d77d6941 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -286,6 +286,7 @@ struct nf_bridge_info {
 struct tc_skb_ext {
 	__u32 chain;
 	__u16 mru;
+	__u16 zone;
 	bool post_ct;
 };
 #endif
-- 
cgit v1.2.3


From 6e478521df535b9d5ef5eb84d4352f235bbbef99 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 30 Jul 2021 09:56:05 -0400
Subject: iomap,xfs: Convert ->discard_page to ->discard_folio

XFS has the only implementation of ->discard_page today, so convert it
to use folios in the same patch as converting the API.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
---
 include/linux/iomap.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 29491fb9c5ba..5ef5088dbbd8 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -285,7 +285,7 @@ struct iomap_writeback_ops {
 	 * Optional, allows the file system to discard state on a page where
 	 * we failed to submit any I/O.
 	 */
-	void (*discard_page)(struct page *page, loff_t fileoff);
+	void (*discard_folio)(struct folio *folio, loff_t pos);
 };
 
 struct iomap_writepage_ctx {
-- 
cgit v1.2.3


From e17f7a0bc4daa44a4809f5f2f947aa2aa74d1369 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 15 Dec 2021 09:45:05 +0100
Subject: uio: remove copy_from_iter_flushcache() and copy_mc_to_iter()

These two wrappers are never used.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20211215084508.435401-2-hch@lst.de
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/uio.h | 20 +-------------------
 1 file changed, 1 insertion(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 6350354f97e9..494d552c1d66 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -196,7 +196,7 @@ bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i)
 #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
 /*
  * Note, users like pmem that depend on the stricter semantics of
- * copy_from_iter_flushcache() than copy_from_iter_nocache() must check for
+ * _copy_from_iter_flushcache() than _copy_from_iter_nocache() must check for
  * IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) before assuming that the
  * destination is flushed from the cache on return.
  */
@@ -211,24 +211,6 @@ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i);
 #define _copy_mc_to_iter _copy_to_iter
 #endif
 
-static __always_inline __must_check
-size_t copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
-{
-	if (unlikely(!check_copy_size(addr, bytes, false)))
-		return 0;
-	else
-		return _copy_from_iter_flushcache(addr, bytes, i);
-}
-
-static __always_inline __must_check
-size_t copy_mc_to_iter(void *addr, size_t bytes, struct iov_iter *i)
-{
-	if (unlikely(!check_copy_size(addr, bytes, true)))
-		return 0;
-	else
-		return _copy_mc_to_iter(addr, bytes, i);
-}
-
 size_t iov_iter_zero(size_t bytes, struct iov_iter *);
 unsigned long iov_iter_alignment(const struct iov_iter *i);
 unsigned long iov_iter_gap_alignment(const struct iov_iter *i);
-- 
cgit v1.2.3


From fd1d00ec92002d8fe28ca981a72395eaa7ae3d11 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 15 Dec 2021 09:45:06 +0100
Subject: dax: simplify dax_synchronous and set_dax_synchronous

Remove the pointless wrappers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Pankaj Gupta <pankaj.gupta@ionos.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Link: https://lore.kernel.org/r/20211215084508.435401-3-hch@lst.de
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/dax.h | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dax.h b/include/linux/dax.h
index 87ae4c9b1d65..3bd1fdb5d5f4 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -48,16 +48,8 @@ void put_dax(struct dax_device *dax_dev);
 void kill_dax(struct dax_device *dax_dev);
 void dax_write_cache(struct dax_device *dax_dev, bool wc);
 bool dax_write_cache_enabled(struct dax_device *dax_dev);
-bool __dax_synchronous(struct dax_device *dax_dev);
-static inline bool dax_synchronous(struct dax_device *dax_dev)
-{
-	return  __dax_synchronous(dax_dev);
-}
-void __set_dax_synchronous(struct dax_device *dax_dev);
-static inline void set_dax_synchronous(struct dax_device *dax_dev)
-{
-	__set_dax_synchronous(dax_dev);
-}
+bool dax_synchronous(struct dax_device *dax_dev);
+void set_dax_synchronous(struct dax_device *dax_dev);
 /*
  * Check if given mapping is supported by the file / underlying device.
  */
-- 
cgit v1.2.3


From 30c6828a17a572aeb9e3a3bacce05fdcf1106541 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 15 Dec 2021 09:45:07 +0100
Subject: dax: remove the DAXDEV_F_SYNC flag

Remove the DAXDEV_F_SYNC flag and thus the flags argument to alloc_dax and
just let the drivers call set_dax_synchronous directly.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Pankaj Gupta <pankaj.gupta@ionos.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Link: https://lore.kernel.org/r/20211215084508.435401-4-hch@lst.de
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/dax.h | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dax.h b/include/linux/dax.h
index 3bd1fdb5d5f4..c04f46478e3b 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -6,9 +6,6 @@
 #include <linux/mm.h>
 #include <linux/radix-tree.h>
 
-/* Flag for synchronous flush */
-#define DAXDEV_F_SYNC (1UL << 0)
-
 typedef unsigned long dax_entry_t;
 
 struct dax_device;
@@ -42,8 +39,7 @@ struct dax_operations {
 };
 
 #if IS_ENABLED(CONFIG_DAX)
-struct dax_device *alloc_dax(void *private, const struct dax_operations *ops,
-		unsigned long flags);
+struct dax_device *alloc_dax(void *private, const struct dax_operations *ops);
 void put_dax(struct dax_device *dax_dev);
 void kill_dax(struct dax_device *dax_dev);
 void dax_write_cache(struct dax_device *dax_dev, bool wc);
@@ -64,7 +60,7 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma,
 }
 #else
 static inline struct dax_device *alloc_dax(void *private,
-		const struct dax_operations *ops, unsigned long flags)
+		const struct dax_operations *ops)
 {
 	/*
 	 * Callers should check IS_ENABLED(CONFIG_DAX) to know if this
-- 
cgit v1.2.3


From 7ac5360cd4d02cc7e0eaf10867f599e041822f12 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 15 Dec 2021 09:45:08 +0100
Subject: dax: remove the copy_from_iter and copy_to_iter methods

These methods indirect the actual DAX read/write path.  In the end pmem
uses magic flush and mc safe variants and fuse and dcssblk use plain ones
while device mapper picks redirects to the underlying device.

Add set_dax_nocache() and set_dax_nomc() APIs to control which copy
routines are used to remove indirect call from the read/write fast path
as well as a lot of boilerplate code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Vivek Goyal <vgoyal@redhat.com> [virtiofs]
Link: https://lore.kernel.org/r/20211215084508.435401-5-hch@lst.de
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/dax.h           | 9 +++------
 include/linux/device-mapper.h | 4 ----
 2 files changed, 3 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dax.h b/include/linux/dax.h
index c04f46478e3b..9fc5f99a0ae2 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -28,12 +28,6 @@ struct dax_operations {
 	 */
 	bool (*dax_supported)(struct dax_device *, struct block_device *, int,
 			sector_t, sector_t);
-	/* copy_from_iter: required operation for fs-dax direct-i/o */
-	size_t (*copy_from_iter)(struct dax_device *, pgoff_t, void *, size_t,
-			struct iov_iter *);
-	/* copy_to_iter: required operation for fs-dax direct-i/o */
-	size_t (*copy_to_iter)(struct dax_device *, pgoff_t, void *, size_t,
-			struct iov_iter *);
 	/* zero_page_range: required operation. Zero page range   */
 	int (*zero_page_range)(struct dax_device *, pgoff_t, size_t);
 };
@@ -95,6 +89,9 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma,
 }
 #endif
 
+void set_dax_nocache(struct dax_device *dax_dev);
+void set_dax_nomc(struct dax_device *dax_dev);
+
 struct writeback_control;
 #if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX)
 int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk);
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index a7df155ea49b..b26fecf6c8e8 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -147,8 +147,6 @@ typedef int (*dm_busy_fn) (struct dm_target *ti);
  */
 typedef long (*dm_dax_direct_access_fn) (struct dm_target *ti, pgoff_t pgoff,
 		long nr_pages, void **kaddr, pfn_t *pfn);
-typedef size_t (*dm_dax_copy_iter_fn)(struct dm_target *ti, pgoff_t pgoff,
-		void *addr, size_t bytes, struct iov_iter *i);
 typedef int (*dm_dax_zero_page_range_fn)(struct dm_target *ti, pgoff_t pgoff,
 		size_t nr_pages);
 
@@ -200,8 +198,6 @@ struct target_type {
 	dm_iterate_devices_fn iterate_devices;
 	dm_io_hints_fn io_hints;
 	dm_dax_direct_access_fn direct_access;
-	dm_dax_copy_iter_fn dax_copy_from_iter;
-	dm_dax_copy_iter_fn dax_copy_to_iter;
 	dm_dax_zero_page_range_fn dax_zero_page_range;
 
 	/* For internal device-mapper use. */
-- 
cgit v1.2.3


From d639b9d13a39cf15639cbe6e8b2c43eb60148a73 Mon Sep 17 00:00:00 2001
From: Hao Luo <haoluo@google.com>
Date: Thu, 16 Dec 2021 16:31:44 -0800
Subject: bpf: Introduce composable reg, ret and arg types.

There are some common properties shared between bpf reg, ret and arg
values. For instance, a value may be a NULL pointer, or a pointer to
a read-only memory. Previously, to express these properties, enumeration
was used. For example, in order to test whether a reg value can be NULL,
reg_type_may_be_null() simply enumerates all types that are possibly
NULL. The problem of this approach is that it's not scalable and causes
a lot of duplication. These properties can be combined, for example, a
type could be either MAYBE_NULL or RDONLY, or both.

This patch series rewrites the layout of reg_type, arg_type and
ret_type, so that common properties can be extracted and represented as
composable flag. For example, one can write

 ARG_PTR_TO_MEM | PTR_MAYBE_NULL

which is equivalent to the previous

 ARG_PTR_TO_MEM_OR_NULL

The type ARG_PTR_TO_MEM are called "base type" in this patch. Base
types can be extended with flags. A flag occupies the higher bits while
base types sits in the lower bits.

This patch in particular sets up a set of macro for this purpose. The
following patches will rewrite arg_types, ret_types and reg_types
respectively.

Signed-off-by: Hao Luo <haoluo@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211217003152.48334-2-haoluo@google.com
---
 include/linux/bpf.h          | 42 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/bpf_verifier.h | 13 +++++++++++++
 2 files changed, 55 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 965fffaf0308..41bb3687cc85 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -297,6 +297,29 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0,
 
 extern const struct bpf_map_ops bpf_map_offload_ops;
 
+/* bpf_type_flag contains a set of flags that are applicable to the values of
+ * arg_type, ret_type and reg_type. For example, a pointer value may be null,
+ * or a memory is read-only. We classify types into two categories: base types
+ * and extended types. Extended types are base types combined with a type flag.
+ *
+ * Currently there are no more than 32 base types in arg_type, ret_type and
+ * reg_types.
+ */
+#define BPF_BASE_TYPE_BITS	8
+
+enum bpf_type_flag {
+	/* PTR may be NULL. */
+	PTR_MAYBE_NULL		= BIT(0 + BPF_BASE_TYPE_BITS),
+
+	__BPF_TYPE_LAST_FLAG	= PTR_MAYBE_NULL,
+};
+
+/* Max number of base types. */
+#define BPF_BASE_TYPE_LIMIT	(1UL << BPF_BASE_TYPE_BITS)
+
+/* Max number of all types. */
+#define BPF_TYPE_LIMIT		(__BPF_TYPE_LAST_FLAG | (__BPF_TYPE_LAST_FLAG - 1))
+
 /* function argument constraints */
 enum bpf_arg_type {
 	ARG_DONTCARE = 0,	/* unused argument in helper function */
@@ -343,7 +366,13 @@ enum bpf_arg_type {
 	ARG_PTR_TO_CONST_STR,	/* pointer to a null terminated read-only string */
 	ARG_PTR_TO_TIMER,	/* pointer to bpf_timer */
 	__BPF_ARG_TYPE_MAX,
+
+	/* This must be the last entry. Its purpose is to ensure the enum is
+	 * wide enough to hold the higher bits reserved for bpf_type_flag.
+	 */
+	__BPF_ARG_TYPE_LIMIT	= BPF_TYPE_LIMIT,
 };
+static_assert(__BPF_ARG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT);
 
 /* type of values returned from helper functions */
 enum bpf_return_type {
@@ -359,7 +388,14 @@ enum bpf_return_type {
 	RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */
 	RET_PTR_TO_MEM_OR_BTF_ID,	/* returns a pointer to a valid memory or a btf_id */
 	RET_PTR_TO_BTF_ID,		/* returns a pointer to a btf_id */
+	__BPF_RET_TYPE_MAX,
+
+	/* This must be the last entry. Its purpose is to ensure the enum is
+	 * wide enough to hold the higher bits reserved for bpf_type_flag.
+	 */
+	__BPF_RET_TYPE_LIMIT	= BPF_TYPE_LIMIT,
 };
+static_assert(__BPF_RET_TYPE_MAX <= BPF_BASE_TYPE_LIMIT);
 
 /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs
  * to in-kernel helper functions and for adjusting imm32 field in BPF_CALL
@@ -461,7 +497,13 @@ enum bpf_reg_type {
 	PTR_TO_FUNC,		 /* reg points to a bpf program function */
 	PTR_TO_MAP_KEY,		 /* reg points to a map element key */
 	__BPF_REG_TYPE_MAX,
+
+	/* This must be the last entry. Its purpose is to ensure the enum is
+	 * wide enough to hold the higher bits reserved for bpf_type_flag.
+	 */
+	__BPF_REG_TYPE_LIMIT	= BPF_TYPE_LIMIT,
 };
+static_assert(__BPF_REG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT);
 
 /* The information passed from prog-specific *_is_valid_access
  * back to the verifier.
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index ee931398f311..34e4ceaca3c7 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -546,5 +546,18 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 			    struct bpf_attach_target_info *tgt_info);
 void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab);
 
+#define BPF_BASE_TYPE_MASK	GENMASK(BPF_BASE_TYPE_BITS - 1, 0)
+
+/* extract base type from bpf_{arg, return, reg}_type. */
+static inline u32 base_type(u32 type)
+{
+	return type & BPF_BASE_TYPE_MASK;
+}
+
+/* extract flags from an extended type. See bpf_type_flag in bpf.h. */
+static inline u32 type_flag(u32 type)
+{
+	return type & ~BPF_BASE_TYPE_MASK;
+}
 
 #endif /* _LINUX_BPF_VERIFIER_H */
-- 
cgit v1.2.3


From 48946bd6a5d695c50b34546864b79c1f910a33c1 Mon Sep 17 00:00:00 2001
From: Hao Luo <haoluo@google.com>
Date: Thu, 16 Dec 2021 16:31:45 -0800
Subject: bpf: Replace ARG_XXX_OR_NULL with ARG_XXX | PTR_MAYBE_NULL

We have introduced a new type to make bpf_arg composable, by
reserving high bits of bpf_arg to represent flags of a type.

One of the flags is PTR_MAYBE_NULL which indicates a pointer
may be NULL. When applying this flag to an arg_type, it means
the arg can take NULL pointer. This patch switches the
qualified arg_types to use this flag. The arg_types changed
in this patch include:

1. ARG_PTR_TO_MAP_VALUE_OR_NULL
2. ARG_PTR_TO_MEM_OR_NULL
3. ARG_PTR_TO_CTX_OR_NULL
4. ARG_PTR_TO_SOCKET_OR_NULL
5. ARG_PTR_TO_ALLOC_MEM_OR_NULL
6. ARG_PTR_TO_STACK_OR_NULL

This patch does not eliminate the use of these arg_types, instead
it makes them an alias to the 'ARG_XXX | PTR_MAYBE_NULL'.

Signed-off-by: Hao Luo <haoluo@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211217003152.48334-3-haoluo@google.com
---
 include/linux/bpf.h | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 41bb3687cc85..765bd7cc4272 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -331,13 +331,11 @@ enum bpf_arg_type {
 	ARG_PTR_TO_MAP_KEY,	/* pointer to stack used as map key */
 	ARG_PTR_TO_MAP_VALUE,	/* pointer to stack used as map value */
 	ARG_PTR_TO_UNINIT_MAP_VALUE,	/* pointer to valid memory used to store a map value */
-	ARG_PTR_TO_MAP_VALUE_OR_NULL,	/* pointer to stack used as map value or NULL */
 
 	/* the following constraints used to prototype bpf_memcmp() and other
 	 * functions that access data on eBPF program stack
 	 */
 	ARG_PTR_TO_MEM,		/* pointer to valid memory (stack, packet, map value) */
-	ARG_PTR_TO_MEM_OR_NULL, /* pointer to valid memory or NULL */
 	ARG_PTR_TO_UNINIT_MEM,	/* pointer to memory does not need to be initialized,
 				 * helper function must fill all bytes or clear
 				 * them in error case.
@@ -347,26 +345,31 @@ enum bpf_arg_type {
 	ARG_CONST_SIZE_OR_ZERO,	/* number of bytes accessed from memory or 0 */
 
 	ARG_PTR_TO_CTX,		/* pointer to context */
-	ARG_PTR_TO_CTX_OR_NULL,	/* pointer to context or NULL */
 	ARG_ANYTHING,		/* any (initialized) argument is ok */
 	ARG_PTR_TO_SPIN_LOCK,	/* pointer to bpf_spin_lock */
 	ARG_PTR_TO_SOCK_COMMON,	/* pointer to sock_common */
 	ARG_PTR_TO_INT,		/* pointer to int */
 	ARG_PTR_TO_LONG,	/* pointer to long */
 	ARG_PTR_TO_SOCKET,	/* pointer to bpf_sock (fullsock) */
-	ARG_PTR_TO_SOCKET_OR_NULL,	/* pointer to bpf_sock (fullsock) or NULL */
 	ARG_PTR_TO_BTF_ID,	/* pointer to in-kernel struct */
 	ARG_PTR_TO_ALLOC_MEM,	/* pointer to dynamically allocated memory */
-	ARG_PTR_TO_ALLOC_MEM_OR_NULL,	/* pointer to dynamically allocated memory or NULL */
 	ARG_CONST_ALLOC_SIZE_OR_ZERO,	/* number of allocated bytes requested */
 	ARG_PTR_TO_BTF_ID_SOCK_COMMON,	/* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */
 	ARG_PTR_TO_PERCPU_BTF_ID,	/* pointer to in-kernel percpu type */
 	ARG_PTR_TO_FUNC,	/* pointer to a bpf program function */
-	ARG_PTR_TO_STACK_OR_NULL,	/* pointer to stack or NULL */
+	ARG_PTR_TO_STACK,	/* pointer to stack */
 	ARG_PTR_TO_CONST_STR,	/* pointer to a null terminated read-only string */
 	ARG_PTR_TO_TIMER,	/* pointer to bpf_timer */
 	__BPF_ARG_TYPE_MAX,
 
+	/* Extended arg_types. */
+	ARG_PTR_TO_MAP_VALUE_OR_NULL	= PTR_MAYBE_NULL | ARG_PTR_TO_MAP_VALUE,
+	ARG_PTR_TO_MEM_OR_NULL		= PTR_MAYBE_NULL | ARG_PTR_TO_MEM,
+	ARG_PTR_TO_CTX_OR_NULL		= PTR_MAYBE_NULL | ARG_PTR_TO_CTX,
+	ARG_PTR_TO_SOCKET_OR_NULL	= PTR_MAYBE_NULL | ARG_PTR_TO_SOCKET,
+	ARG_PTR_TO_ALLOC_MEM_OR_NULL	= PTR_MAYBE_NULL | ARG_PTR_TO_ALLOC_MEM,
+	ARG_PTR_TO_STACK_OR_NULL	= PTR_MAYBE_NULL | ARG_PTR_TO_STACK,
+
 	/* This must be the last entry. Its purpose is to ensure the enum is
 	 * wide enough to hold the higher bits reserved for bpf_type_flag.
 	 */
-- 
cgit v1.2.3


From 3c4807322660d4290ac9062c034aed6b87243861 Mon Sep 17 00:00:00 2001
From: Hao Luo <haoluo@google.com>
Date: Thu, 16 Dec 2021 16:31:46 -0800
Subject: bpf: Replace RET_XXX_OR_NULL with RET_XXX | PTR_MAYBE_NULL

We have introduced a new type to make bpf_ret composable, by
reserving high bits to represent flags.

One of the flag is PTR_MAYBE_NULL, which indicates a pointer
may be NULL. When applying this flag to ret_types, it means
the returned value could be a NULL pointer. This patch
switches the qualified arg_types to use this flag.
The ret_types changed in this patch include:

1. RET_PTR_TO_MAP_VALUE_OR_NULL
2. RET_PTR_TO_SOCKET_OR_NULL
3. RET_PTR_TO_TCP_SOCK_OR_NULL
4. RET_PTR_TO_SOCK_COMMON_OR_NULL
5. RET_PTR_TO_ALLOC_MEM_OR_NULL
6. RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL
7. RET_PTR_TO_BTF_ID_OR_NULL

This patch doesn't eliminate the use of these names, instead
it makes them aliases to 'RET_PTR_TO_XXX | PTR_MAYBE_NULL'.

Signed-off-by: Hao Luo <haoluo@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211217003152.48334-4-haoluo@google.com
---
 include/linux/bpf.h | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 765bd7cc4272..975a1d5951bd 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -382,17 +382,22 @@ enum bpf_return_type {
 	RET_INTEGER,			/* function returns integer */
 	RET_VOID,			/* function doesn't return anything */
 	RET_PTR_TO_MAP_VALUE,		/* returns a pointer to map elem value */
-	RET_PTR_TO_MAP_VALUE_OR_NULL,	/* returns a pointer to map elem value or NULL */
-	RET_PTR_TO_SOCKET_OR_NULL,	/* returns a pointer to a socket or NULL */
-	RET_PTR_TO_TCP_SOCK_OR_NULL,	/* returns a pointer to a tcp_sock or NULL */
-	RET_PTR_TO_SOCK_COMMON_OR_NULL,	/* returns a pointer to a sock_common or NULL */
-	RET_PTR_TO_ALLOC_MEM_OR_NULL,	/* returns a pointer to dynamically allocated memory or NULL */
-	RET_PTR_TO_BTF_ID_OR_NULL,	/* returns a pointer to a btf_id or NULL */
-	RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */
+	RET_PTR_TO_SOCKET,		/* returns a pointer to a socket */
+	RET_PTR_TO_TCP_SOCK,		/* returns a pointer to a tcp_sock */
+	RET_PTR_TO_SOCK_COMMON,		/* returns a pointer to a sock_common */
+	RET_PTR_TO_ALLOC_MEM,		/* returns a pointer to dynamically allocated memory */
 	RET_PTR_TO_MEM_OR_BTF_ID,	/* returns a pointer to a valid memory or a btf_id */
 	RET_PTR_TO_BTF_ID,		/* returns a pointer to a btf_id */
 	__BPF_RET_TYPE_MAX,
 
+	/* Extended ret_types. */
+	RET_PTR_TO_MAP_VALUE_OR_NULL	= PTR_MAYBE_NULL | RET_PTR_TO_MAP_VALUE,
+	RET_PTR_TO_SOCKET_OR_NULL	= PTR_MAYBE_NULL | RET_PTR_TO_SOCKET,
+	RET_PTR_TO_TCP_SOCK_OR_NULL	= PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK,
+	RET_PTR_TO_SOCK_COMMON_OR_NULL	= PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON,
+	RET_PTR_TO_ALLOC_MEM_OR_NULL	= PTR_MAYBE_NULL | RET_PTR_TO_ALLOC_MEM,
+	RET_PTR_TO_BTF_ID_OR_NULL	= PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID,
+
 	/* This must be the last entry. Its purpose is to ensure the enum is
 	 * wide enough to hold the higher bits reserved for bpf_type_flag.
 	 */
-- 
cgit v1.2.3


From c25b2ae136039ffa820c26138ed4a5e5f3ab3841 Mon Sep 17 00:00:00 2001
From: Hao Luo <haoluo@google.com>
Date: Thu, 16 Dec 2021 16:31:47 -0800
Subject: bpf: Replace PTR_TO_XXX_OR_NULL with PTR_TO_XXX | PTR_MAYBE_NULL

We have introduced a new type to make bpf_reg composable, by
allocating bits in the type to represent flags.

One of the flags is PTR_MAYBE_NULL which indicates a pointer
may be NULL. This patch switches the qualified reg_types to
use this flag. The reg_types changed in this patch include:

1. PTR_TO_MAP_VALUE_OR_NULL
2. PTR_TO_SOCKET_OR_NULL
3. PTR_TO_SOCK_COMMON_OR_NULL
4. PTR_TO_TCP_SOCK_OR_NULL
5. PTR_TO_BTF_ID_OR_NULL
6. PTR_TO_MEM_OR_NULL
7. PTR_TO_RDONLY_BUF_OR_NULL
8. PTR_TO_RDWR_BUF_OR_NULL

Signed-off-by: Hao Luo <haoluo@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/r/20211217003152.48334-5-haoluo@google.com
---
 include/linux/bpf.h          | 18 +++++++++---------
 include/linux/bpf_verifier.h |  4 ++++
 2 files changed, 13 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 975a1d5951bd..c3de62267b84 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -465,18 +465,15 @@ enum bpf_reg_type {
 	PTR_TO_CTX,		 /* reg points to bpf_context */
 	CONST_PTR_TO_MAP,	 /* reg points to struct bpf_map */
 	PTR_TO_MAP_VALUE,	 /* reg points to map element value */
-	PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
+	PTR_TO_MAP_KEY,		 /* reg points to a map element key */
 	PTR_TO_STACK,		 /* reg == frame_pointer + offset */
 	PTR_TO_PACKET_META,	 /* skb->data - meta_len */
 	PTR_TO_PACKET,		 /* reg points to skb->data */
 	PTR_TO_PACKET_END,	 /* skb->data + headlen */
 	PTR_TO_FLOW_KEYS,	 /* reg points to bpf_flow_keys */
 	PTR_TO_SOCKET,		 /* reg points to struct bpf_sock */
-	PTR_TO_SOCKET_OR_NULL,	 /* reg points to struct bpf_sock or NULL */
 	PTR_TO_SOCK_COMMON,	 /* reg points to sock_common */
-	PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */
 	PTR_TO_TCP_SOCK,	 /* reg points to struct tcp_sock */
-	PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */
 	PTR_TO_TP_BUFFER,	 /* reg points to a writable raw tp's buffer */
 	PTR_TO_XDP_SOCK,	 /* reg points to struct xdp_sock */
 	/* PTR_TO_BTF_ID points to a kernel struct that does not need
@@ -494,18 +491,21 @@ enum bpf_reg_type {
 	 * been checked for null. Used primarily to inform the verifier
 	 * an explicit null check is required for this struct.
 	 */
-	PTR_TO_BTF_ID_OR_NULL,
 	PTR_TO_MEM,		 /* reg points to valid memory region */
-	PTR_TO_MEM_OR_NULL,	 /* reg points to valid memory region or NULL */
 	PTR_TO_RDONLY_BUF,	 /* reg points to a readonly buffer */
-	PTR_TO_RDONLY_BUF_OR_NULL, /* reg points to a readonly buffer or NULL */
 	PTR_TO_RDWR_BUF,	 /* reg points to a read/write buffer */
-	PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */
 	PTR_TO_PERCPU_BTF_ID,	 /* reg points to a percpu kernel variable */
 	PTR_TO_FUNC,		 /* reg points to a bpf program function */
-	PTR_TO_MAP_KEY,		 /* reg points to a map element key */
 	__BPF_REG_TYPE_MAX,
 
+	/* Extended reg_types. */
+	PTR_TO_MAP_VALUE_OR_NULL	= PTR_MAYBE_NULL | PTR_TO_MAP_VALUE,
+	PTR_TO_SOCKET_OR_NULL		= PTR_MAYBE_NULL | PTR_TO_SOCKET,
+	PTR_TO_SOCK_COMMON_OR_NULL	= PTR_MAYBE_NULL | PTR_TO_SOCK_COMMON,
+	PTR_TO_TCP_SOCK_OR_NULL		= PTR_MAYBE_NULL | PTR_TO_TCP_SOCK,
+	PTR_TO_BTF_ID_OR_NULL		= PTR_MAYBE_NULL | PTR_TO_BTF_ID,
+	PTR_TO_MEM_OR_NULL		= PTR_MAYBE_NULL | PTR_TO_MEM,
+
 	/* This must be the last entry. Its purpose is to ensure the enum is
 	 * wide enough to hold the higher bits reserved for bpf_type_flag.
 	 */
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 34e4ceaca3c7..143401d4c9d9 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -18,6 +18,8 @@
  * that converting umax_value to int cannot overflow.
  */
 #define BPF_MAX_VAR_SIZ	(1 << 29)
+/* size of type_str_buf in bpf_verifier. */
+#define TYPE_STR_BUF_LEN 64
 
 /* Liveness marks, used for registers and spilled-regs (in stack slots).
  * Read marks propagate upwards until they find a write mark; they record that
@@ -484,6 +486,8 @@ struct bpf_verifier_env {
 	/* Same as scratched_regs but for stack slots */
 	u64 scratched_stack_slots;
 	u32 prev_log_len, prev_insn_print_len;
+	/* buffer used in reg_type_str() to generate reg_type string */
+	char type_str_buf[TYPE_STR_BUF_LEN];
 };
 
 __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log,
-- 
cgit v1.2.3


From 20b2aff4bc15bda809f994761d5719827d66c0b4 Mon Sep 17 00:00:00 2001
From: Hao Luo <haoluo@google.com>
Date: Thu, 16 Dec 2021 16:31:48 -0800
Subject: bpf: Introduce MEM_RDONLY flag

This patch introduce a flag MEM_RDONLY to tag a reg value
pointing to read-only memory. It makes the following changes:

1. PTR_TO_RDWR_BUF -> PTR_TO_BUF
2. PTR_TO_RDONLY_BUF -> PTR_TO_BUF | MEM_RDONLY

Signed-off-by: Hao Luo <haoluo@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211217003152.48334-6-haoluo@google.com
---
 include/linux/bpf.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c3de62267b84..126048110bdb 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -311,7 +311,10 @@ enum bpf_type_flag {
 	/* PTR may be NULL. */
 	PTR_MAYBE_NULL		= BIT(0 + BPF_BASE_TYPE_BITS),
 
-	__BPF_TYPE_LAST_FLAG	= PTR_MAYBE_NULL,
+	/* MEM is read-only. */
+	MEM_RDONLY		= BIT(1 + BPF_BASE_TYPE_BITS),
+
+	__BPF_TYPE_LAST_FLAG	= MEM_RDONLY,
 };
 
 /* Max number of base types. */
@@ -492,8 +495,7 @@ enum bpf_reg_type {
 	 * an explicit null check is required for this struct.
 	 */
 	PTR_TO_MEM,		 /* reg points to valid memory region */
-	PTR_TO_RDONLY_BUF,	 /* reg points to a readonly buffer */
-	PTR_TO_RDWR_BUF,	 /* reg points to a read/write buffer */
+	PTR_TO_BUF,		 /* reg points to a read/write buffer */
 	PTR_TO_PERCPU_BTF_ID,	 /* reg points to a percpu kernel variable */
 	PTR_TO_FUNC,		 /* reg points to a bpf program function */
 	__BPF_REG_TYPE_MAX,
-- 
cgit v1.2.3


From cf9f2f8d62eca810afbd1ee6cc0800202b000e57 Mon Sep 17 00:00:00 2001
From: Hao Luo <haoluo@google.com>
Date: Thu, 16 Dec 2021 16:31:49 -0800
Subject: bpf: Convert PTR_TO_MEM_OR_NULL to composable types.

Remove PTR_TO_MEM_OR_NULL and replace it with PTR_TO_MEM combined with
flag PTR_MAYBE_NULL.

Signed-off-by: Hao Luo <haoluo@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211217003152.48334-7-haoluo@google.com
---
 include/linux/bpf.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 126048110bdb..567d83bf28f9 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -506,7 +506,6 @@ enum bpf_reg_type {
 	PTR_TO_SOCK_COMMON_OR_NULL	= PTR_MAYBE_NULL | PTR_TO_SOCK_COMMON,
 	PTR_TO_TCP_SOCK_OR_NULL		= PTR_MAYBE_NULL | PTR_TO_TCP_SOCK,
 	PTR_TO_BTF_ID_OR_NULL		= PTR_MAYBE_NULL | PTR_TO_BTF_ID,
-	PTR_TO_MEM_OR_NULL		= PTR_MAYBE_NULL | PTR_TO_MEM,
 
 	/* This must be the last entry. Its purpose is to ensure the enum is
 	 * wide enough to hold the higher bits reserved for bpf_type_flag.
-- 
cgit v1.2.3


From 216e3cd2f28dbbf1fe86848e0e29e6693b9f0a20 Mon Sep 17 00:00:00 2001
From: Hao Luo <haoluo@google.com>
Date: Thu, 16 Dec 2021 16:31:51 -0800
Subject: bpf: Add MEM_RDONLY for helper args that are pointers to rdonly mem.

Some helper functions may modify its arguments, for example,
bpf_d_path, bpf_get_stack etc. Previously, their argument types
were marked as ARG_PTR_TO_MEM, which is compatible with read-only
mem types, such as PTR_TO_RDONLY_BUF. Therefore it's legitimate,
but technically incorrect, to modify a read-only memory by passing
it into one of such helper functions.

This patch tags the bpf_args compatible with immutable memory with
MEM_RDONLY flag. The arguments that don't have this flag will be
only compatible with mutable memory types, preventing the helper
from modifying a read-only memory. The bpf_args that have
MEM_RDONLY are compatible with both mutable memory and immutable
memory.

Signed-off-by: Hao Luo <haoluo@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211217003152.48334-9-haoluo@google.com
---
 include/linux/bpf.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 567d83bf28f9..26753139d5b4 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -311,7 +311,9 @@ enum bpf_type_flag {
 	/* PTR may be NULL. */
 	PTR_MAYBE_NULL		= BIT(0 + BPF_BASE_TYPE_BITS),
 
-	/* MEM is read-only. */
+	/* MEM is read-only. When applied on bpf_arg, it indicates the arg is
+	 * compatible with both mutable and immutable memory.
+	 */
 	MEM_RDONLY		= BIT(1 + BPF_BASE_TYPE_BITS),
 
 	__BPF_TYPE_LAST_FLAG	= MEM_RDONLY,
-- 
cgit v1.2.3


From 8cbfe939abe905280279e84a297b1cb34e0d0ec9 Mon Sep 17 00:00:00 2001
From: Baowen Zheng <baowen.zheng@corigine.com>
Date: Fri, 17 Dec 2021 19:16:22 +0100
Subject: flow_offload: allow user to offload tc action to net device

Use flow_indr_dev_register/flow_indr_dev_setup_offload to
offload tc action.

We need to call tc_cleanup_flow_action to clean up tc action entry since
in tc_setup_action, some actions may hold dev refcnt, especially the mirror
action.

Signed-off-by: Baowen Zheng <baowen.zheng@corigine.com>
Signed-off-by: Louis Peens <louis.peens@corigine.com>
Signed-off-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a419718612c6..8b0bdeb4734e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -920,6 +920,7 @@ enum tc_setup_type {
 	TC_SETUP_QDISC_TBF,
 	TC_SETUP_QDISC_FIFO,
 	TC_SETUP_QDISC_HTB,
+	TC_SETUP_ACT,
 };
 
 /* These structures hold the attributes of bpf state that are being passed
-- 
cgit v1.2.3


From 7d4203c13435c0bdae61bf16bbd0408d5b958ade Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Thu, 25 Nov 2021 18:15:37 +0100
Subject: mm: add virt_to_folio() and folio_address()

These two wrappers around their respective struct page variants will be
useful in the following patches.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Roman Gushchin <guro@fb.com>
---
 include/linux/mm.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index a7e4a9e7d807..4a6cf22483da 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -863,6 +863,13 @@ static inline struct page *virt_to_head_page(const void *x)
 	return compound_head(page);
 }
 
+static inline struct folio *virt_to_folio(const void *x)
+{
+	struct page *page = virt_to_page(x);
+
+	return page_folio(page);
+}
+
 void __put_page(struct page *page);
 
 void put_pages_list(struct list_head *pages);
@@ -1753,6 +1760,11 @@ void page_address_init(void);
 #define page_address_init()  do { } while(0)
 #endif
 
+static inline void *folio_address(const struct folio *folio)
+{
+	return page_address(&folio->page);
+}
+
 extern void *page_rmapping(struct page *page);
 extern struct anon_vma *page_anon_vma(struct page *page);
 extern pgoff_t __page_file_index(struct page *page);
-- 
cgit v1.2.3


From d5c383f2c98ac58c210b266cdaf7b86bc32d1ad1 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Fri, 17 Dec 2021 15:30:56 +0000
Subject: iommu/iova: Squash entry_dtor abstraction

All flush queues are driven by iommu-dma now, so there is no need to
abstract entry_dtor or its data any more. Squash the now-canonical
implementation directly into the IOVA code to get it out of the way.

Reviewed-by: John Garry <john.garry@huawei.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/2260f8de00ab5e0f9d2a1cf8978e6ae7cd4f182c.1639753638.git.robin.murphy@arm.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iova.h | 26 +++-----------------------
 1 file changed, 3 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iova.h b/include/linux/iova.h
index 71d8a2de6635..e746d8e41449 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -40,9 +40,6 @@ struct iova_domain;
 /* Call-Back from IOVA code into IOMMU drivers */
 typedef void (* iova_flush_cb)(struct iova_domain *domain);
 
-/* Destructor for per-entry data */
-typedef void (* iova_entry_dtor)(unsigned long data);
-
 /* Number of entries per Flush Queue */
 #define IOVA_FQ_SIZE	256
 
@@ -53,7 +50,7 @@ typedef void (* iova_entry_dtor)(unsigned long data);
 struct iova_fq_entry {
 	unsigned long iova_pfn;
 	unsigned long pages;
-	unsigned long data;
+	struct page *freelist;
 	u64 counter; /* Flush counter when this entrie was added */
 };
 
@@ -88,9 +85,6 @@ struct iova_domain {
 	iova_flush_cb	flush_cb;	/* Call-Back function to flush IOMMU
 					   TLBs */
 
-	iova_entry_dtor entry_dtor;	/* IOMMU driver specific destructor for
-					   iova entry */
-
 	struct timer_list fq_timer;		/* Timer to regularily empty the
 						   flush-queues */
 	atomic_t fq_timer_on;			/* 1 when timer is active, 0
@@ -146,15 +140,14 @@ void free_iova_fast(struct iova_domain *iovad, unsigned long pfn,
 		    unsigned long size);
 void queue_iova(struct iova_domain *iovad,
 		unsigned long pfn, unsigned long pages,
-		unsigned long data);
+		struct page *freelist);
 unsigned long alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
 			      unsigned long limit_pfn, bool flush_rcache);
 struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
 	unsigned long pfn_hi);
 void init_iova_domain(struct iova_domain *iovad, unsigned long granule,
 	unsigned long start_pfn);
-int init_iova_flush_queue(struct iova_domain *iovad,
-			  iova_flush_cb flush_cb, iova_entry_dtor entry_dtor);
+int init_iova_flush_queue(struct iova_domain *iovad, iova_flush_cb flush_cb);
 struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn);
 void put_iova_domain(struct iova_domain *iovad);
 #else
@@ -189,12 +182,6 @@ static inline void free_iova_fast(struct iova_domain *iovad,
 {
 }
 
-static inline void queue_iova(struct iova_domain *iovad,
-			      unsigned long pfn, unsigned long pages,
-			      unsigned long data)
-{
-}
-
 static inline unsigned long alloc_iova_fast(struct iova_domain *iovad,
 					    unsigned long size,
 					    unsigned long limit_pfn,
@@ -216,13 +203,6 @@ static inline void init_iova_domain(struct iova_domain *iovad,
 {
 }
 
-static inline int init_iova_flush_queue(struct iova_domain *iovad,
-					iova_flush_cb flush_cb,
-					iova_entry_dtor entry_dtor)
-{
-	return -ENODEV;
-}
-
 static inline struct iova *find_iova(struct iova_domain *iovad,
 				     unsigned long pfn)
 {
-- 
cgit v1.2.3


From 649ad9835a3783bcb6c69368fa939e0010abb2c6 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Fri, 17 Dec 2021 15:30:57 +0000
Subject: iommu/iova: Squash flush_cb abstraction

Once again, with iommu-dma now being the only flush queue user, we no
longer need the extra level of indirection through flush_cb. Squash that
and let the flush queue code call the domain method directly. This does
mean temporarily having to carry an additional copy of the IOMMU domain
pointer around instead, but only until a later patch untangles it again.

Reviewed-by: John Garry <john.garry@huawei.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/e3f9b4acdd6640012ef4fbc819ac868d727b64a9.1639753638.git.robin.murphy@arm.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iova.h | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iova.h b/include/linux/iova.h
index e746d8e41449..99be4fcea4f3 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -14,6 +14,7 @@
 #include <linux/rbtree.h>
 #include <linux/atomic.h>
 #include <linux/dma-mapping.h>
+#include <linux/iommu.h>
 
 /* iova structure */
 struct iova {
@@ -35,11 +36,6 @@ struct iova_rcache {
 	struct iova_cpu_rcache __percpu *cpu_rcaches;
 };
 
-struct iova_domain;
-
-/* Call-Back from IOVA code into IOMMU drivers */
-typedef void (* iova_flush_cb)(struct iova_domain *domain);
-
 /* Number of entries per Flush Queue */
 #define IOVA_FQ_SIZE	256
 
@@ -82,8 +78,7 @@ struct iova_domain {
 	struct iova	anchor;		/* rbtree lookup anchor */
 	struct iova_rcache rcaches[IOVA_RANGE_CACHE_MAX_SIZE];	/* IOVA range caches */
 
-	iova_flush_cb	flush_cb;	/* Call-Back function to flush IOMMU
-					   TLBs */
+	struct iommu_domain *fq_domain;
 
 	struct timer_list fq_timer;		/* Timer to regularily empty the
 						   flush-queues */
@@ -147,7 +142,7 @@ struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
 	unsigned long pfn_hi);
 void init_iova_domain(struct iova_domain *iovad, unsigned long granule,
 	unsigned long start_pfn);
-int init_iova_flush_queue(struct iova_domain *iovad, iova_flush_cb flush_cb);
+int init_iova_flush_queue(struct iova_domain *iovad, struct iommu_domain *fq_domain);
 struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn);
 void put_iova_domain(struct iova_domain *iovad);
 #else
-- 
cgit v1.2.3


From 87f60cc65d24939353b40aa1d9297fea080cdf8d Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 17 Dec 2021 15:31:00 +0000
Subject: iommu/vt-d: Use put_pages_list

page->freelist is for the use of slab.  We already have the ability
to free a list of pages in the core mm, but it requires the use of a
list_head and for the pages to be chained together through page->lru.
Switch the Intel IOMMU and IOVA code over to using free_pages_list().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
[rm: split from original patch, cosmetic tweaks, fix fq entries]
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Link: https://lore.kernel.org/r/2115b560d9a0ce7cd4b948bd51a2b7bde8fdfd59.1639753638.git.robin.murphy@arm.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iommu.h | 3 ++-
 include/linux/iova.h  | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index d2f3435e7d17..de0c57a567c8 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -186,7 +186,7 @@ struct iommu_iotlb_gather {
 	unsigned long		start;
 	unsigned long		end;
 	size_t			pgsize;
-	struct page		*freelist;
+	struct list_head	freelist;
 	bool			queued;
 };
 
@@ -399,6 +399,7 @@ static inline void iommu_iotlb_gather_init(struct iommu_iotlb_gather *gather)
 {
 	*gather = (struct iommu_iotlb_gather) {
 		.start	= ULONG_MAX,
+		.freelist = LIST_HEAD_INIT(gather->freelist),
 	};
 }
 
diff --git a/include/linux/iova.h b/include/linux/iova.h
index 99be4fcea4f3..072a09c06e8a 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -46,7 +46,7 @@ struct iova_rcache {
 struct iova_fq_entry {
 	unsigned long iova_pfn;
 	unsigned long pages;
-	struct page *freelist;
+	struct list_head freelist;
 	u64 counter; /* Flush counter when this entrie was added */
 };
 
@@ -135,7 +135,7 @@ void free_iova_fast(struct iova_domain *iovad, unsigned long pfn,
 		    unsigned long size);
 void queue_iova(struct iova_domain *iovad,
 		unsigned long pfn, unsigned long pages,
-		struct page *freelist);
+		struct list_head *freelist);
 unsigned long alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
 			      unsigned long limit_pfn, bool flush_rcache);
 struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
-- 
cgit v1.2.3


From a17e3026bc4da9135ca9a42ec0b1fa67f95172e3 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Fri, 17 Dec 2021 15:31:03 +0000
Subject: iommu: Move flush queue data into iommu_dma_cookie

Complete the move into iommu-dma by refactoring the flush queues
themselves to belong to the DMA cookie rather than the IOVA domain.

The refactoring may as well extend to some minor cosmetic aspects
too, to help us stay one step ahead of the style police.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/24304722005bc6f144e2a1fdd865d1465722fc2e.1639753638.git.robin.murphy@arm.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iova.h | 44 +-------------------------------------------
 1 file changed, 1 insertion(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iova.h b/include/linux/iova.h
index 072a09c06e8a..0abd48c5e622 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -12,9 +12,6 @@
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/rbtree.h>
-#include <linux/atomic.h>
-#include <linux/dma-mapping.h>
-#include <linux/iommu.h>
 
 /* iova structure */
 struct iova {
@@ -36,27 +33,6 @@ struct iova_rcache {
 	struct iova_cpu_rcache __percpu *cpu_rcaches;
 };
 
-/* Number of entries per Flush Queue */
-#define IOVA_FQ_SIZE	256
-
-/* Timeout (in ms) after which entries are flushed from the Flush-Queue */
-#define IOVA_FQ_TIMEOUT	10
-
-/* Flush Queue entry for defered flushing */
-struct iova_fq_entry {
-	unsigned long iova_pfn;
-	unsigned long pages;
-	struct list_head freelist;
-	u64 counter; /* Flush counter when this entrie was added */
-};
-
-/* Per-CPU Flush Queue structure */
-struct iova_fq {
-	struct iova_fq_entry entries[IOVA_FQ_SIZE];
-	unsigned head, tail;
-	spinlock_t lock;
-};
-
 /* holds all the iova translations for a domain */
 struct iova_domain {
 	spinlock_t	iova_rbtree_lock; /* Lock to protect update of rbtree */
@@ -67,23 +43,9 @@ struct iova_domain {
 	unsigned long	start_pfn;	/* Lower limit for this domain */
 	unsigned long	dma_32bit_pfn;
 	unsigned long	max32_alloc_size; /* Size of last failed allocation */
-	struct iova_fq __percpu *fq;	/* Flush Queue */
-
-	atomic64_t	fq_flush_start_cnt;	/* Number of TLB flushes that
-						   have been started */
-
-	atomic64_t	fq_flush_finish_cnt;	/* Number of TLB flushes that
-						   have been finished */
-
 	struct iova	anchor;		/* rbtree lookup anchor */
-	struct iova_rcache rcaches[IOVA_RANGE_CACHE_MAX_SIZE];	/* IOVA range caches */
-
-	struct iommu_domain *fq_domain;
 
-	struct timer_list fq_timer;		/* Timer to regularily empty the
-						   flush-queues */
-	atomic_t fq_timer_on;			/* 1 when timer is active, 0
-						   when not */
+	struct iova_rcache rcaches[IOVA_RANGE_CACHE_MAX_SIZE];	/* IOVA range caches */
 	struct hlist_node	cpuhp_dead;
 };
 
@@ -133,16 +95,12 @@ struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size,
 	bool size_aligned);
 void free_iova_fast(struct iova_domain *iovad, unsigned long pfn,
 		    unsigned long size);
-void queue_iova(struct iova_domain *iovad,
-		unsigned long pfn, unsigned long pages,
-		struct list_head *freelist);
 unsigned long alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
 			      unsigned long limit_pfn, bool flush_rcache);
 struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
 	unsigned long pfn_hi);
 void init_iova_domain(struct iova_domain *iovad, unsigned long granule,
 	unsigned long start_pfn);
-int init_iova_flush_queue(struct iova_domain *iovad, struct iommu_domain *fq_domain);
 struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn);
 void put_iova_domain(struct iova_domain *iovad);
 #else
-- 
cgit v1.2.3


From 5bc9a9dd75351023793d8aa4116ead005d659729 Mon Sep 17 00:00:00 2001
From: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Date: Sun, 19 Dec 2021 21:51:24 +0200
Subject: rfkill: allow to get the software rfkill state

iwlwifi needs to be able to differentiate between the
software rfkill state and the hardware rfkill state.

The reason for this is that iwlwifi needs to notify any
change in the software rfkill state even when it doesn't
own the device (which means even when the hardware rfkill
is asserted).

In order to be able to know the software rfkill when the
host does not own the device, iwlwifi needs to be able to
ask the state of the software rfkill ignoring the state
of the hardware rfkill.

Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Link: https://lore.kernel.org/r/20211219195124.125689-1-emmanuel.grumbach@intel.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/rfkill.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h
index 231e06b74b50..c35f3962dc4f 100644
--- a/include/linux/rfkill.h
+++ b/include/linux/rfkill.h
@@ -229,6 +229,13 @@ void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw);
  */
 bool rfkill_blocked(struct rfkill *rfkill);
 
+/**
+ * rfkill_soft_blocked - Query soft rfkill block state
+ *
+ * @rfkill: rfkill struct to query
+ */
+bool rfkill_soft_blocked(struct rfkill *rfkill);
+
 /**
  * rfkill_find_type - Helper for finding rfkill type by name
  * @name: the name of the type
-- 
cgit v1.2.3


From aade40b62745cf0b4e8a17d43652c5faff354e6b Mon Sep 17 00:00:00 2001
From: Joerg Roedel <jroedel@suse.de>
Date: Mon, 20 Dec 2021 13:34:48 +0100
Subject: iommu/iova: Temporarily include dma-mapping.h from iova.h

Some users of iova.h still expect that dma-mapping.h is also included.
Re-add the include until these users are updated to fix compile
failures in the iommu tree.

Acked-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/20211220123448.19996-1-joro@8bytes.org
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iova.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/iova.h b/include/linux/iova.h
index 0abd48c5e622..cea79cb9f26c 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -12,6 +12,7 @@
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/rbtree.h>
+#include <linux/dma-mapping.h>
 
 /* iova structure */
 struct iova {
-- 
cgit v1.2.3


From 59f37b7370ef56e6faf25d0e18bc597a0af40bb8 Mon Sep 17 00:00:00 2001
From: Sam Protsenko <semen.protsenko@linaro.org>
Date: Sat, 4 Dec 2021 21:57:55 +0200
Subject: tty: serial: samsung: Remove USI initialization

USI control is now extracted to the dedicated USI driver. Remove USI
related code from serial driver to avoid conflicts and code duplication.

Signed-off-by: Sam Protsenko <semen.protsenko@linaro.org>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Link: https://lore.kernel.org/r/20211204195757.8600-4-semen.protsenko@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/serial_s3c.h | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/serial_s3c.h b/include/linux/serial_s3c.h
index cf0de4a86640..f6c3323fc4c5 100644
--- a/include/linux/serial_s3c.h
+++ b/include/linux/serial_s3c.h
@@ -27,15 +27,6 @@
 #define S3C2410_UERSTAT	  (0x14)
 #define S3C2410_UFSTAT	  (0x18)
 #define S3C2410_UMSTAT	  (0x1C)
-#define USI_CON		  (0xC4)
-#define USI_OPTION	  (0xC8)
-
-#define USI_CON_RESET			(1<<0)
-#define USI_CON_RESET_MASK		(1<<0)
-
-#define USI_OPTION_HWACG_CLKREQ_ON	(1<<1)
-#define USI_OPTION_HWACG_CLKSTOP_ON	(1<<2)
-#define USI_OPTION_HWACG_MASK		(3<<1)
 
 #define S3C2410_LCON_CFGMASK	  ((0xF<<3)|(0x3))
 
-- 
cgit v1.2.3


From 1a5e91d8375fc8369207cc0b9894a324f2bbf1d9 Mon Sep 17 00:00:00 2001
From: Tianyu Lan <Tianyu.Lan@microsoft.com>
Date: Mon, 13 Dec 2021 02:14:02 -0500
Subject: swiotlb: Add swiotlb bounce buffer remap function for HV IVM

In Isolation VM with AMD SEV, bounce buffer needs to be accessed via
extra address space which is above shared_gpa_boundary (E.G 39 bit
address line) reported by Hyper-V CPUID ISOLATION_CONFIG. The access
physical address will be original physical address + shared_gpa_boundary.
The shared_gpa_boundary in the AMD SEV SNP spec is called virtual top of
memory(vTOM). Memory addresses below vTOM are automatically treated as
private while memory above vTOM is treated as shared.

Expose swiotlb_unencrypted_base for platforms to set unencrypted
memory base offset and platform calls swiotlb_update_mem_attributes()
to remap swiotlb mem to unencrypted address space. memremap() can
not be called in the early stage and so put remapping code into
swiotlb_update_mem_attributes(). Store remap address and use it to copy
data from/to swiotlb bounce buffer.

Signed-off-by: Tianyu Lan <Tianyu.Lan@microsoft.com>
Acked-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Michael Kelley <mikelley@microsoft.com>
Link: https://lore.kernel.org/r/20211213071407.314309-2-ltykernel@gmail.com
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 include/linux/swiotlb.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 569272871375..f6c3638255d5 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -73,6 +73,9 @@ extern enum swiotlb_force swiotlb_force;
  * @end:	The end address of the swiotlb memory pool. Used to do a quick
  *		range check to see if the memory was in fact allocated by this
  *		API.
+ * @vaddr:	The vaddr of the swiotlb memory pool. The swiotlb memory pool
+ *		may be remapped in the memory encrypted case and store virtual
+ *		address for bounce buffer operation.
  * @nslabs:	The number of IO TLB blocks (in groups of 64) between @start and
  *		@end. For default swiotlb, this is command line adjustable via
  *		setup_io_tlb_npages.
@@ -92,6 +95,7 @@ extern enum swiotlb_force swiotlb_force;
 struct io_tlb_mem {
 	phys_addr_t start;
 	phys_addr_t end;
+	void *vaddr;
 	unsigned long nslabs;
 	unsigned long used;
 	unsigned int index;
@@ -186,4 +190,6 @@ static inline bool is_swiotlb_for_alloc(struct device *dev)
 }
 #endif /* CONFIG_DMA_RESTRICTED_POOL */
 
+extern phys_addr_t swiotlb_unencrypted_base;
+
 #endif /* __LINUX_SWIOTLB_H */
-- 
cgit v1.2.3


From 743b237c3a7b0f5b44aa704aae8a1058877b6322 Mon Sep 17 00:00:00 2001
From: Tianyu Lan <Tianyu.Lan@microsoft.com>
Date: Mon, 13 Dec 2021 02:14:05 -0500
Subject: scsi: storvsc: Add Isolation VM support for storvsc driver

In Isolation VM, all shared memory with host needs to mark visible
to host via hvcall. vmbus_establish_gpadl() has already done it for
storvsc rx/tx ring buffer. The page buffer used by vmbus_sendpacket_
mpb_desc() still needs to be handled. Use DMA API(scsi_dma_map/unmap)
to map these memory during sending/receiving packet and return swiotlb
bounce buffer dma address. In Isolation VM, swiotlb  bounce buffer is
marked to be visible to host and the swiotlb force mode is enabled.

Set device's dma min align mask to HV_HYP_PAGE_SIZE - 1 in order to
keep the original data offset in the bounce buffer.

Signed-off-by: Tianyu Lan <Tianyu.Lan@microsoft.com>
Reviewed-by: Long Li <longli@microsoft.com>
Reviewed-by: Michael Kelley <mikelley@microsoft.com>
Link: https://lore.kernel.org/r/20211213071407.314309-5-ltykernel@gmail.com
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 include/linux/hyperv.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index b823311eac79..650a0574b746 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1261,6 +1261,7 @@ struct hv_device {
 
 	struct vmbus_channel *channel;
 	struct kset	     *channels_kset;
+	struct device_dma_parameters dma_parms;
 
 	/* place holder to keep track of the dir for hv device in debugfs */
 	struct dentry *debug_dir;
-- 
cgit v1.2.3


From 846da38de0e8224f2f94b885125cf1fd2d7b0d39 Mon Sep 17 00:00:00 2001
From: Tianyu Lan <Tianyu.Lan@microsoft.com>
Date: Mon, 13 Dec 2021 02:14:06 -0500
Subject: net: netvsc: Add Isolation VM support for netvsc driver

In Isolation VM, all shared memory with host needs to mark visible
to host via hvcall. vmbus_establish_gpadl() has already done it for
netvsc rx/tx ring buffer. The page buffer used by vmbus_sendpacket_
pagebuffer() stills need to be handled. Use DMA API to map/umap
these memory during sending/receiving packet and Hyper-V swiotlb
bounce buffer dma address will be returned. The swiotlb bounce buffer
has been masked to be visible to host during boot up.

rx/tx ring buffer is allocated via vzalloc() and they need to be
mapped into unencrypted address space(above vTOM) before sharing
with host and accessing. Add hv_map/unmap_memory() to map/umap rx
/tx ring buffer.

Signed-off-by: Tianyu Lan <Tianyu.Lan@microsoft.com>
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Reviewed-by: Michael Kelley <mikelley@microsoft.com>
Link: https://lore.kernel.org/r/20211213071407.314309-6-ltykernel@gmail.com
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 include/linux/hyperv.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 650a0574b746..f565a8938836 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1584,6 +1584,11 @@ struct hyperv_service_callback {
 	void (*callback)(void *context);
 };
 
+struct hv_dma_range {
+	dma_addr_t dma;
+	u32 mapping_size;
+};
+
 #define MAX_SRV_VER	0x7ffffff
 extern bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp, u8 *buf, u32 buflen,
 				const int *fw_version, int fw_vercnt,
-- 
cgit v1.2.3


From 6fc61c39ee1adb5f4115d288c876772fcd8b6979 Mon Sep 17 00:00:00 2001
From: Konrad Dybcio <konrad.dybcio@somainline.org>
Date: Sun, 21 Nov 2021 01:20:46 +0100
Subject: soc: qcom: llcc: Add configuration data for SM8350

Add LLCC configuration data for SM8350 SoC.

Signed-off-by: Konrad Dybcio <konrad.dybcio@somainline.org>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Link: https://lore.kernel.org/r/20211121002050.36977-2-konrad.dybcio@somainline.org
---
 include/linux/soc/qcom/llcc-qcom.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/soc/qcom/llcc-qcom.h b/include/linux/soc/qcom/llcc-qcom.h
index 437c9df13229..9e8fd92c96b7 100644
--- a/include/linux/soc/qcom/llcc-qcom.h
+++ b/include/linux/soc/qcom/llcc-qcom.h
@@ -33,6 +33,9 @@
 #define LLCC_MODPE       29
 #define LLCC_APTCM       30
 #define LLCC_WRCACHE     31
+#define LLCC_CVPFW       32
+#define LLCC_CPUSS1      33
+#define LLCC_CPUHWT      36
 
 /**
  * struct llcc_slice_desc - Cache slice descriptor
-- 
cgit v1.2.3


From 7e5cced9ca84df52d874aca6b632f930b3dc5bc6 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Mon, 20 Dec 2021 09:49:01 -0500
Subject: net: accept UFOv6 packages in virtio_net_hdr_to_skb

Skb with skb->protocol 0 at the time of virtio_net_hdr_to_skb may have
a protocol inferred from virtio_net_hdr with virtio_net_hdr_set_proto.

Unlike TCP, UDP does not have separate types for IPv4 and IPv6. Type
VIRTIO_NET_HDR_GSO_UDP is guessed to be IPv4/UDP. As of the below
commit, UFOv6 packets are dropped due to not matching the protocol as
obtained from dev_parse_header_protocol.

Invert the test to take that L2 protocol field as starting point and
pass both UFOv4 and UFOv6 for VIRTIO_NET_HDR_GSO_UDP.

Fixes: 924a9bc362a5 ("net: check if protocol extracted by virtio_net_hdr_set_proto is correct")
Link: https://lore.kernel.org/netdev/CABcq3pG9GRCYqFDBAJ48H1vpnnX=41u+MhQnayF1ztLH4WX0Fw@mail.gmail.com/
Reported-by: Andrew Melnichenko <andrew@daynix.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Link: https://lore.kernel.org/r/20211220144901.2784030-1-willemdebruijn.kernel@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/virtio_net.h | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index 04e87f4b9417..22dd48c82560 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -7,6 +7,21 @@
 #include <uapi/linux/udp.h>
 #include <uapi/linux/virtio_net.h>
 
+static inline bool virtio_net_hdr_match_proto(__be16 protocol, __u8 gso_type)
+{
+	switch (gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
+	case VIRTIO_NET_HDR_GSO_TCPV4:
+		return protocol == cpu_to_be16(ETH_P_IP);
+	case VIRTIO_NET_HDR_GSO_TCPV6:
+		return protocol == cpu_to_be16(ETH_P_IPV6);
+	case VIRTIO_NET_HDR_GSO_UDP:
+		return protocol == cpu_to_be16(ETH_P_IP) ||
+		       protocol == cpu_to_be16(ETH_P_IPV6);
+	default:
+		return false;
+	}
+}
+
 static inline int virtio_net_hdr_set_proto(struct sk_buff *skb,
 					   const struct virtio_net_hdr *hdr)
 {
@@ -88,9 +103,12 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
 			if (!skb->protocol) {
 				__be16 protocol = dev_parse_header_protocol(skb);
 
-				virtio_net_hdr_set_proto(skb, hdr);
-				if (protocol && protocol != skb->protocol)
+				if (!protocol)
+					virtio_net_hdr_set_proto(skb, hdr);
+				else if (!virtio_net_hdr_match_proto(protocol, hdr->gso_type))
 					return -EINVAL;
+				else
+					skb->protocol = protocol;
 			}
 retry:
 			if (!skb_flow_dissect_flow_keys_basic(NULL, skb, &keys,
-- 
cgit v1.2.3


From 1ed1d592113959f00cc552c3b9f47ca2d157768f Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Mon, 20 Dec 2021 09:50:27 -0500
Subject: net: skip virtio_net_hdr_set_proto if protocol already set

virtio_net_hdr_set_proto infers skb->protocol from the virtio_net_hdr
gso_type, to avoid packets getting dropped for lack of a proto type.

Its protocol choice is a guess, especially in the case of UFO, where
the single VIRTIO_NET_HDR_GSO_UDP label covers both UFOv4 and UFOv6.

Skip this best effort if the field is already initialized. Whether
explicitly from userspace, or implicitly based on an earlier call to
dev_parse_header_protocol (which is more robust, but was introduced
after this patch).

Fixes: 9d2f67e43b73 ("net/packet: fix packet drop as of virtio gso")
Signed-off-by: Willem de Bruijn <willemb@google.com>
Link: https://lore.kernel.org/r/20211220145027.2784293-1-willemdebruijn.kernel@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/virtio_net.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index 22dd48c82560..a960de68ac69 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -25,6 +25,9 @@ static inline bool virtio_net_hdr_match_proto(__be16 protocol, __u8 gso_type)
 static inline int virtio_net_hdr_set_proto(struct sk_buff *skb,
 					   const struct virtio_net_hdr *hdr)
 {
+	if (skb->protocol)
+		return 0;
+
 	switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
 	case VIRTIO_NET_HDR_GSO_TCPV4:
 	case VIRTIO_NET_HDR_GSO_UDP:
-- 
cgit v1.2.3


From 80a5ca99c5c04be6777df225ab932142a9d60c3f Mon Sep 17 00:00:00 2001
From: Tiezhu Yang <yangtiezhu@loongson.cn>
Date: Thu, 16 Dec 2021 11:33:00 +0800
Subject: rapidio: remove not used macro definition in rio_ids.h

The definition of RIO_VID_FREESCALE, RIO_DID_MPC8560, RIO_DID_TSI500,
RIO_DID_TSI576 and RIO_DID_TSI721 are not used for many years in the
current code, so just remove them.

Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn>
Link: https://lore.kernel.org/r/1639625581-22867-2-git-send-email-yangtiezhu@loongson.cn
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/rio_ids.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rio_ids.h b/include/linux/rio_ids.h
index 4846f72759b2..e74d8840708a 100644
--- a/include/linux/rio_ids.h
+++ b/include/linux/rio_ids.h
@@ -9,15 +9,10 @@
 #ifndef LINUX_RIO_IDS_H
 #define LINUX_RIO_IDS_H
 
-#define RIO_VID_FREESCALE		0x0002
-#define RIO_DID_MPC8560			0x0003
-
 #define RIO_VID_TUNDRA			0x000d
-#define RIO_DID_TSI500			0x0500
 #define RIO_DID_TSI568			0x0568
 #define RIO_DID_TSI572			0x0572
 #define RIO_DID_TSI574			0x0574
-#define RIO_DID_TSI576			0x0578 /* Same ID as Tsi578 */
 #define RIO_DID_TSI577			0x0577
 #define RIO_DID_TSI578			0x0578
 
@@ -33,7 +28,6 @@
 #define RIO_DID_IDTCPS1616		0x0379
 #define RIO_DID_IDTVPS1616		0x0377
 #define RIO_DID_IDTSPS1616		0x0378
-#define RIO_DID_TSI721			0x80ab
 #define RIO_DID_IDTRXS1632		0x80e5
 #define RIO_DID_IDTRXS2448		0x80e6
 
-- 
cgit v1.2.3


From 612d4904191ff9aca01b1e087d8687b3a223cb33 Mon Sep 17 00:00:00 2001
From: Tiezhu Yang <yangtiezhu@loongson.cn>
Date: Thu, 16 Dec 2021 11:33:01 +0800
Subject: rapidio: remove not used code about RIO_VID_TUNDRA

According to https://rapidio.org/vendor-id/, there is no 0x000d vendor id
in the complete and current list of VendorIDs, it means that the related
code is dead code now, so just remove them.

Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn>
Link: https://lore.kernel.org/r/1639625581-22867-3-git-send-email-yangtiezhu@loongson.cn
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/rio_ids.h | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rio_ids.h b/include/linux/rio_ids.h
index e74d8840708a..c7e2f21dd5c1 100644
--- a/include/linux/rio_ids.h
+++ b/include/linux/rio_ids.h
@@ -9,13 +9,6 @@
 #ifndef LINUX_RIO_IDS_H
 #define LINUX_RIO_IDS_H
 
-#define RIO_VID_TUNDRA			0x000d
-#define RIO_DID_TSI568			0x0568
-#define RIO_DID_TSI572			0x0572
-#define RIO_DID_TSI574			0x0574
-#define RIO_DID_TSI577			0x0577
-#define RIO_DID_TSI578			0x0578
-
 #define RIO_VID_IDT			0x0038
 #define RIO_DID_IDT70K200		0x0310
 #define RIO_DID_IDTCPS8			0x035c
-- 
cgit v1.2.3


From 0032ca576a79946492194ae4860b462d32815c66 Mon Sep 17 00:00:00 2001
From: Yanteng Si <siyanteng01@gmail.com>
Date: Tue, 21 Dec 2021 17:16:46 +0900
Subject: counter: Add the necessary colons and indents to the comments of
 counter_compi

Since commit aaec1a0f76ec ("counter: Internalize sysfs interface code")
introduce a warning as:

linux-next/Documentation/driver-api/generic-counter:234: ./include/linux/counter.h:43: WARNING: Unexpected indentation.
linux-next/Documentation/driver-api/generic-counter:234: ./include/linux/counter.h:45: WARNING: Block quote ends without a blank line; unexpected unindent.

Add the necessary colons and indents.

Fixes: aaec1a0f76ec ("counter: Internalize sysfs interface code")
Signed-off-by: Yanteng Si <siyanteng@loongson.cn>
Signed-off-by: William Breathitt Gray <vilhelm.gray@gmail.com>
Link: https://lore.kernel.org/r/26011e814d6eca02c7ebdbb92f171a49928a7e89.1640072891.git.vilhelm.gray@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/counter.h | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/counter.h b/include/linux/counter.h
index b7d0a00a61cf..dfbde2808998 100644
--- a/include/linux/counter.h
+++ b/include/linux/counter.h
@@ -38,64 +38,64 @@ enum counter_comp_type {
  * @type:		Counter component data type
  * @name:		device-specific component name
  * @priv:		component-relevant data
- * @action_read		Synapse action mode read callback. The read value of the
+ * @action_read:		Synapse action mode read callback. The read value of the
  *			respective Synapse action mode should be passed back via
  *			the action parameter.
- * @device_u8_read	Device u8 component read callback. The read value of the
+ * @device_u8_read:		Device u8 component read callback. The read value of the
  *			respective Device u8 component should be passed back via
  *			the val parameter.
- * @count_u8_read	Count u8 component read callback. The read value of the
+ * @count_u8_read:		Count u8 component read callback. The read value of the
  *			respective Count u8 component should be passed back via
  *			the val parameter.
- * @signal_u8_read	Signal u8 component read callback. The read value of the
+ * @signal_u8_read:		Signal u8 component read callback. The read value of the
  *			respective Signal u8 component should be passed back via
  *			the val parameter.
- * @device_u32_read	Device u32 component read callback. The read value of
+ * @device_u32_read:		Device u32 component read callback. The read value of
  *			the respective Device u32 component should be passed
  *			back via the val parameter.
- * @count_u32_read	Count u32 component read callback. The read value of the
+ * @count_u32_read:		Count u32 component read callback. The read value of the
  *			respective Count u32 component should be passed back via
  *			the val parameter.
- * @signal_u32_read	Signal u32 component read callback. The read value of
+ * @signal_u32_read:		Signal u32 component read callback. The read value of
  *			the respective Signal u32 component should be passed
  *			back via the val parameter.
- * @device_u64_read	Device u64 component read callback. The read value of
+ * @device_u64_read:		Device u64 component read callback. The read value of
  *			the respective Device u64 component should be passed
  *			back via the val parameter.
- * @count_u64_read	Count u64 component read callback. The read value of the
+ * @count_u64_read:		Count u64 component read callback. The read value of the
  *			respective Count u64 component should be passed back via
  *			the val parameter.
- * @signal_u64_read	Signal u64 component read callback. The read value of
+ * @signal_u64_read:		Signal u64 component read callback. The read value of
  *			the respective Signal u64 component should be passed
  *			back via the val parameter.
- * @action_write	Synapse action mode write callback. The write value of
+ * @action_write:		Synapse action mode write callback. The write value of
  *			the respective Synapse action mode is passed via the
  *			action parameter.
- * @device_u8_write	Device u8 component write callback. The write value of
+ * @device_u8_write:		Device u8 component write callback. The write value of
  *			the respective Device u8 component is passed via the val
  *			parameter.
- * @count_u8_write	Count u8 component write callback. The write value of
+ * @count_u8_write:		Count u8 component write callback. The write value of
  *			the respective Count u8 component is passed via the val
  *			parameter.
- * @signal_u8_write	Signal u8 component write callback. The write value of
+ * @signal_u8_write:		Signal u8 component write callback. The write value of
  *			the respective Signal u8 component is passed via the val
  *			parameter.
- * @device_u32_write	Device u32 component write callback. The write value of
+ * @device_u32_write:		Device u32 component write callback. The write value of
  *			the respective Device u32 component is passed via the
  *			val parameter.
- * @count_u32_write	Count u32 component write callback. The write value of
+ * @count_u32_write:		Count u32 component write callback. The write value of
  *			the respective Count u32 component is passed via the val
  *			parameter.
- * @signal_u32_write	Signal u32 component write callback. The write value of
+ * @signal_u32_write:		Signal u32 component write callback. The write value of
  *			the respective Signal u32 component is passed via the
  *			val parameter.
- * @device_u64_write	Device u64 component write callback. The write value of
+ * @device_u64_write:		Device u64 component write callback. The write value of
  *			the respective Device u64 component is passed via the
  *			val parameter.
- * @count_u64_write	Count u64 component write callback. The write value of
+ * @count_u64_write:		Count u64 component write callback. The write value of
  *			the respective Count u64 component is passed via the val
  *			parameter.
- * @signal_u64_write	Signal u64 component write callback. The write value of
+ * @signal_u64_write:		Signal u64 component write callback. The write value of
  *			the respective Signal u64 component is passed via the
  *			val parameter.
  */
-- 
cgit v1.2.3


From 79f1c7304295bbbc611bc53cfd5425b777b3e840 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 9 Dec 2021 14:30:08 +0200
Subject: kernfs: Replace kernel.h with the necessary inclusions

When kernel.h is used in the headers it adds a lot into dependency hell,
especially when there are circular dependencies are involved.

Replace kernel.h inclusion with the list of what is really being used.

Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20211209123008.3391-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/kernfs.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 9f650986a81b..861c4f0f8a29 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -6,7 +6,6 @@
 #ifndef __LINUX_KERNFS_H
 #define __LINUX_KERNFS_H
 
-#include <linux/kernel.h>
 #include <linux/err.h>
 #include <linux/list.h>
 #include <linux/mutex.h>
@@ -14,6 +13,8 @@
 #include <linux/lockdep.h>
 #include <linux/rbtree.h>
 #include <linux/atomic.h>
+#include <linux/bug.h>
+#include <linux/types.h>
 #include <linux/uidgid.h>
 #include <linux/wait.h>
 #include <linux/rwsem.h>
@@ -23,6 +24,7 @@ struct dentry;
 struct iattr;
 struct seq_file;
 struct vm_area_struct;
+struct vm_operations_struct;
 struct super_block;
 struct file_system_type;
 struct poll_table_struct;
-- 
cgit v1.2.3


From eca6e2d4a4a4b824f055eeaaa24f1c2327fb91a2 Mon Sep 17 00:00:00 2001
From: Anand Ashok Dumbre <anand.ashok.dumbre@xilinx.com>
Date: Fri, 3 Dec 2021 21:23:54 +0000
Subject: device property: Add fwnode_iomap()

This patch introduces a new helper routine - fwnode_iomap(), which
allows to map the memory mapped IO for a given device node.

This implementation does not cover the ACPI case and may be expanded
in the future. The main purpose here is to be able to develop resource
provider agnostic drivers.

Suggested-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Anand Ashok Dumbre <anand.ashok.dumbre@xilinx.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Link: https://lore.kernel.org/r/20211203212358.31444-2-anand.ashok.dumbre@xilinx.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/property.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/property.h b/include/linux/property.h
index 88fa726a76df..6670d5a1ec2a 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -122,6 +122,8 @@ void fwnode_handle_put(struct fwnode_handle *fwnode);
 
 int fwnode_irq_get(const struct fwnode_handle *fwnode, unsigned int index);
 
+void __iomem *fwnode_iomap(struct fwnode_handle *fwnode, int index);
+
 unsigned int device_get_child_node_count(struct device *dev);
 
 static inline bool device_property_read_bool(struct device *dev,
-- 
cgit v1.2.3


From 1b0b6cc8030d08d2a24e9e5f85dc36c5a58200ba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= <linux@weissschuh.net>
Date: Wed, 24 Nov 2021 00:27:01 +0100
Subject: power: supply: add charge_behaviour attributes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This a revised version of
"[RFC] add standardized attributes for force_discharge and inhibit_charge" [0],
incorporating discussion results.

The biggest change is the switch from two boolean attributes to a single
enum attribute.

[0] https://lore.kernel.org/platform-driver-x86/21569a89-8303-8573-05fb-c2fec29983d1@gmail.com/

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Acked-by: Sebastian Reichel <sebastian.reichel@collabora.com>
Link: https://lore.kernel.org/r/20211123232704.25394-2-linux@weissschuh.net
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/power_supply.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index 9ca1f120a211..70c333e86293 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -132,6 +132,7 @@ enum power_supply_property {
 	POWER_SUPPLY_PROP_CHARGE_CONTROL_LIMIT_MAX,
 	POWER_SUPPLY_PROP_CHARGE_CONTROL_START_THRESHOLD, /* in percents! */
 	POWER_SUPPLY_PROP_CHARGE_CONTROL_END_THRESHOLD, /* in percents! */
+	POWER_SUPPLY_PROP_CHARGE_BEHAVIOUR,
 	POWER_SUPPLY_PROP_INPUT_CURRENT_LIMIT,
 	POWER_SUPPLY_PROP_INPUT_VOLTAGE_LIMIT,
 	POWER_SUPPLY_PROP_INPUT_POWER_LIMIT,
@@ -202,6 +203,12 @@ enum power_supply_usb_type {
 	POWER_SUPPLY_USB_TYPE_APPLE_BRICK_ID,	/* Apple Charging Method */
 };
 
+enum power_supply_charge_behaviour {
+	POWER_SUPPLY_CHARGE_BEHAVIOUR_AUTO = 0,
+	POWER_SUPPLY_CHARGE_BEHAVIOUR_INHIBIT_CHARGE,
+	POWER_SUPPLY_CHARGE_BEHAVIOUR_FORCE_DISCHARGE,
+};
+
 enum power_supply_notifier_events {
 	PSY_EVENT_PROP_CHANGED,
 };
-- 
cgit v1.2.3


From 539b9c94ac83563842a27e8cc3de5164b15c4de0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= <linux@weissschuh.net>
Date: Wed, 24 Nov 2021 00:27:02 +0100
Subject: power: supply: add helpers for charge_behaviour sysfs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These helper functions can be used by drivers to implement their own
sysfs-attributes.
This is useful for ACPI-drivers extending the default ACPI-battery with
their own charge_behaviour attributes.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Acked-by: Sebastian Reichel <sebastian.reichel@collabora.com>
Link: https://lore.kernel.org/r/20211123232704.25394-3-linux@weissschuh.net
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/power_supply.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index 70c333e86293..71f0379c2af8 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -546,4 +546,13 @@ static inline
 void power_supply_remove_hwmon_sysfs(struct power_supply *psy) {}
 #endif
 
+#ifdef CONFIG_SYSFS
+ssize_t power_supply_charge_behaviour_show(struct device *dev,
+					   unsigned int available_behaviours,
+					   enum power_supply_charge_behaviour behaviour,
+					   char *buf);
+
+int power_supply_charge_behaviour_parse(unsigned int available_behaviours, const char *buf);
+#endif
+
 #endif /* __LINUX_POWER_SUPPLY_H__ */
-- 
cgit v1.2.3


From 37ae5a0f5287a52cf51242e76ccf198d02ffe495 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Date: Sat, 18 Dec 2021 18:41:56 +0900
Subject: block: use "unsigned long" for blk_validate_block_size().

Since lo_simple_ioctl(LOOP_SET_BLOCK_SIZE) and ioctl(NBD_SET_BLKSIZE) pass
user-controlled "unsigned long arg" to blk_validate_block_size(),
"unsigned long" should be used for validation.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/9ecbf057-4375-c2db-ab53-e4cc0dff953d@i-love.sakura.ne.jp
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c80cfaefc0a8..bb5fb7282e6e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -45,7 +45,7 @@ struct blk_crypto_profile;
  */
 #define BLKCG_MAX_POLS		6
 
-static inline int blk_validate_block_size(unsigned int bsize)
+static inline int blk_validate_block_size(unsigned long bsize)
 {
 	if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize))
 		return -EINVAL;
-- 
cgit v1.2.3


From dcce50e6cc4d86a63dc0a9a6ee7d4f948ccd53a1 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 8 Nov 2021 14:35:59 -0800
Subject: compiler.h: Fix annotation macro misplacement with Clang

When building with Clang and CONFIG_TRACE_BRANCH_PROFILING, there are a
lot of unreachable warnings, like:

  arch/x86/kernel/traps.o: warning: objtool: handle_xfd_event()+0x134: unreachable instruction

Without an input to the inline asm, 'volatile' is ignored for some
reason and Clang feels free to move the reachable() annotation away from
its intended location.

Fix that by re-adding the counter value to the inputs.

Fixes: f1069a8756b9 ("compiler.h: Avoid using inline asm operand modifiers")
Fixes: c199f64ff93c ("instrumentation.h: Avoid using inline asm operand modifiers")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Link: https://lore.kernel.org/r/0417e96909b97a406323409210de7bf13df0b170.1636410380.git.jpoimboe@redhat.com
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: x86@kernel.org
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Miroslav Benes <mbenes@suse.cz>
---
 include/linux/compiler.h        | 4 ++--
 include/linux/instrumentation.h | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 3d5af56337bd..429dcebe2b99 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -121,7 +121,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
 	asm volatile(__stringify_label(c) ":\n\t"			\
 		     ".pushsection .discard.reachable\n\t"		\
 		     ".long " __stringify_label(c) "b - .\n\t"		\
-		     ".popsection\n\t");				\
+		     ".popsection\n\t" : : "i" (c));			\
 })
 #define annotate_reachable() __annotate_reachable(__COUNTER__)
 
@@ -129,7 +129,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
 	asm volatile(__stringify_label(c) ":\n\t"			\
 		     ".pushsection .discard.unreachable\n\t"		\
 		     ".long " __stringify_label(c) "b - .\n\t"		\
-		     ".popsection\n\t");				\
+		     ".popsection\n\t" : : "i" (c));			\
 })
 #define annotate_unreachable() __annotate_unreachable(__COUNTER__)
 
diff --git a/include/linux/instrumentation.h b/include/linux/instrumentation.h
index fa2cd8c63dcc..24359b4a9605 100644
--- a/include/linux/instrumentation.h
+++ b/include/linux/instrumentation.h
@@ -11,7 +11,7 @@
 	asm volatile(__stringify(c) ": nop\n\t"				\
 		     ".pushsection .discard.instr_begin\n\t"		\
 		     ".long " __stringify(c) "b - .\n\t"		\
-		     ".popsection\n\t");				\
+		     ".popsection\n\t" : : "i" (c));			\
 })
 #define instrumentation_begin() __instrumentation_begin(__COUNTER__)
 
@@ -50,7 +50,7 @@
 	asm volatile(__stringify(c) ": nop\n\t"				\
 		     ".pushsection .discard.instr_end\n\t"		\
 		     ".long " __stringify(c) "b - .\n\t"		\
-		     ".popsection\n\t");				\
+		     ".popsection\n\t" : : "i" (c));			\
 })
 #define instrumentation_end() __instrumentation_end(__COUNTER__)
 #else
-- 
cgit v1.2.3


From f857acfc457ea63fa5b862d77f055665d863acfe Mon Sep 17 00:00:00 2001
From: Logan Gunthorpe <logang@deltatee.com>
Date: Wed, 17 Nov 2021 14:53:48 -0700
Subject: lib/scatterlist: cleanup macros into static inline functions

Convert the sg_is_chain(), sg_is_last() and sg_chain_ptr() macros
into static inline functions. There's no reason for these to be macros
and static inline are generally preferred these days.

Also introduce the SG_PAGE_LINK_MASK define so the P2PDMA work, which is
adding another bit to this mask, can do so more easily.

Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/scatterlist.h | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index 266754a55327..7ff9d6386c12 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -69,10 +69,27 @@ struct sg_append_table {
  * a valid sg entry, or whether it points to the start of a new scatterlist.
  * Those low bits are there for everyone! (thanks mason :-)
  */
-#define sg_is_chain(sg)		((sg)->page_link & SG_CHAIN)
-#define sg_is_last(sg)		((sg)->page_link & SG_END)
-#define sg_chain_ptr(sg)	\
-	((struct scatterlist *) ((sg)->page_link & ~(SG_CHAIN | SG_END)))
+#define SG_PAGE_LINK_MASK (SG_CHAIN | SG_END)
+
+static inline unsigned int __sg_flags(struct scatterlist *sg)
+{
+	return sg->page_link & SG_PAGE_LINK_MASK;
+}
+
+static inline struct scatterlist *sg_chain_ptr(struct scatterlist *sg)
+{
+	return (struct scatterlist *)(sg->page_link & ~SG_PAGE_LINK_MASK);
+}
+
+static inline bool sg_is_chain(struct scatterlist *sg)
+{
+	return __sg_flags(sg) & SG_CHAIN;
+}
+
+static inline bool sg_is_last(struct scatterlist *sg)
+{
+	return __sg_flags(sg) & SG_END;
+}
 
 /**
  * sg_assign_page - Assign a given page to an SG entry
@@ -92,7 +109,7 @@ static inline void sg_assign_page(struct scatterlist *sg, struct page *page)
 	 * In order for the low bit stealing approach to work, pages
 	 * must be aligned at a 32-bit boundary as a minimum.
 	 */
-	BUG_ON((unsigned long) page & (SG_CHAIN | SG_END));
+	BUG_ON((unsigned long)page & SG_PAGE_LINK_MASK);
 #ifdef CONFIG_DEBUG_SG
 	BUG_ON(sg_is_chain(sg));
 #endif
@@ -126,7 +143,7 @@ static inline struct page *sg_page(struct scatterlist *sg)
 #ifdef CONFIG_DEBUG_SG
 	BUG_ON(sg_is_chain(sg));
 #endif
-	return (struct page *)((sg)->page_link & ~(SG_CHAIN | SG_END));
+	return (struct page *)((sg)->page_link & ~SG_PAGE_LINK_MASK);
 }
 
 /**
-- 
cgit v1.2.3


From 365481e42a8a95c55e43e8cc236138718e762e7b Mon Sep 17 00:00:00 2001
From: "David E. Box" <david.e.box@linux.intel.com>
Date: Tue, 7 Dec 2021 17:50:11 -0800
Subject: driver core: auxiliary bus: Add driver data helpers

Adds get/set driver data helpers for auxiliary devices.

Reviewed-by: Mark Gross <markgross@kernel.org>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: David E. Box <david.e.box@linux.intel.com>
Link: https://lore.kernel.org/r/20211208015015.891275-3-david.e.box@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/auxiliary_bus.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/auxiliary_bus.h b/include/linux/auxiliary_bus.h
index e6d8b5c16226..de21d9d24a95 100644
--- a/include/linux/auxiliary_bus.h
+++ b/include/linux/auxiliary_bus.h
@@ -188,6 +188,16 @@ struct auxiliary_driver {
 	const struct auxiliary_device_id *id_table;
 };
 
+static inline void *auxiliary_get_drvdata(struct auxiliary_device *auxdev)
+{
+	return dev_get_drvdata(&auxdev->dev);
+}
+
+static inline void auxiliary_set_drvdata(struct auxiliary_device *auxdev, void *data)
+{
+	dev_set_drvdata(&auxdev->dev, data);
+}
+
 static inline struct auxiliary_device *to_auxiliary_dev(struct device *dev)
 {
 	return container_of(dev, struct auxiliary_device, dev);
-- 
cgit v1.2.3


From b398123bff3bcbc1facb0f29bf6e7b9f1bc55931 Mon Sep 17 00:00:00 2001
From: Pingfan Liu <kernelfans@gmail.com>
Date: Wed, 15 Dec 2021 10:13:48 +0800
Subject: efi: apply memblock cap after memblock_add()

On arm64, during kdump kernel saves vmcore, it runs into the following bug:
...
[   15.148919] usercopy: Kernel memory exposure attempt detected from SLUB object 'kmem_cache_node' (offset 0, size 4096)!
[   15.159707] ------------[ cut here ]------------
[   15.164311] kernel BUG at mm/usercopy.c:99!
[   15.168482] Internal error: Oops - BUG: 0 [#1] SMP
[   15.173261] Modules linked in: xfs libcrc32c crct10dif_ce ghash_ce sha2_ce sha256_arm64 sha1_ce sbsa_gwdt ast i2c_algo_bit drm_vram_helper drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops cec drm_ttm_helper ttm drm nvme nvme_core xgene_hwmon i2c_designware_platform i2c_designware_core dm_mirror dm_region_hash dm_log dm_mod overlay squashfs zstd_decompress loop
[   15.206186] CPU: 0 PID: 542 Comm: cp Not tainted 5.16.0-rc4 #1
[   15.212006] Hardware name: GIGABYTE R272-P30-JG/MP32-AR0-JG, BIOS F12 (SCP: 1.5.20210426) 05/13/2021
[   15.221125] pstate: 60400009 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[   15.228073] pc : usercopy_abort+0x9c/0xa0
[   15.232074] lr : usercopy_abort+0x9c/0xa0
[   15.236070] sp : ffff8000121abba0
[   15.239371] x29: ffff8000121abbb0 x28: 0000000000003000 x27: 0000000000000000
[   15.246494] x26: 0000000080000400 x25: 0000ffff885c7000 x24: 0000000000000000
[   15.253617] x23: 000007ff80400000 x22: ffff07ff80401000 x21: 0000000000000001
[   15.260739] x20: 0000000000001000 x19: ffff07ff80400000 x18: ffffffffffffffff
[   15.267861] x17: 656a626f2042554c x16: 53206d6f72662064 x15: 6574636574656420
[   15.274983] x14: 74706d6574746120 x13: 2129363930342065 x12: 7a6973202c302074
[   15.282105] x11: ffffc8b041d1b148 x10: 00000000ffff8000 x9 : ffffc8b04012812c
[   15.289228] x8 : 00000000ffff7fff x7 : ffffc8b041d1b148 x6 : 0000000000000000
[   15.296349] x5 : 0000000000000000 x4 : 0000000000007fff x3 : 0000000000000000
[   15.303471] x2 : 0000000000000000 x1 : ffff07ff8c064800 x0 : 000000000000006b
[   15.310593] Call trace:
[   15.313027]  usercopy_abort+0x9c/0xa0
[   15.316677]  __check_heap_object+0xd4/0xf0
[   15.320762]  __check_object_size.part.0+0x160/0x1e0
[   15.325628]  __check_object_size+0x2c/0x40
[   15.329711]  copy_oldmem_page+0x7c/0x140
[   15.333623]  read_from_oldmem.part.0+0xfc/0x1c0
[   15.338142]  __read_vmcore.constprop.0+0x23c/0x350
[   15.342920]  read_vmcore+0x28/0x34
[   15.346309]  proc_reg_read+0xb4/0xf0
[   15.349871]  vfs_read+0xb8/0x1f0
[   15.353088]  ksys_read+0x74/0x100
[   15.356390]  __arm64_sys_read+0x28/0x34
...

This bug introduced by commit b261dba2fdb2 ("arm64: kdump: Remove custom
linux,usable-memory-range handling"), which moves
memblock_cap_memory_range() to fdt, but it breaches the rules that
memblock_cap_memory_range() should come after memblock_add() etc as said
in commit e888fa7bb882 ("memblock: Check memory add/cap ordering").

As a consequence, the virtual address set up by copy_oldmem_page() does
not bail out from the test of virt_addr_valid() in check_heap_object(),
and finally hits the BUG_ON().

Since memblock allocator has no idea about when the memblock is fully
populated, while efi_init() is aware, so tackling this issue by calling the
interface early_init_dt_check_for_usable_mem_range() exposed by of/fdt.

Fixes: b261dba2fdb2 ("arm64: kdump: Remove custom linux,usable-memory-range handling")
Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
Cc: Rob Herring <robh+dt@kernel.org>
Cc: Zhen Lei <thunder.leizhen@huawei.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Geert Uytterhoeven <geert+renesas@glider.be>
Cc: Frank Rowand <frowand.list@gmail.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Nick Terrell <terrelln@fb.com>
Cc: linux-arm-kernel@lists.infradead.org
To: devicetree@vger.kernel.org
To: linux-efi@vger.kernel.org
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20211215021348.8766-1-kernelfans@gmail.com
---
 include/linux/of_fdt.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h
index cf48983d3c86..ad09beb6d13c 100644
--- a/include/linux/of_fdt.h
+++ b/include/linux/of_fdt.h
@@ -62,6 +62,7 @@ extern int early_init_dt_scan_chosen(unsigned long node, const char *uname,
 				     int depth, void *data);
 extern int early_init_dt_scan_memory(unsigned long node, const char *uname,
 				     int depth, void *data);
+extern void early_init_dt_check_for_usable_mem_range(void);
 extern int early_init_dt_scan_chosen_stdout(void);
 extern void early_init_fdt_scan_reserved_mem(void);
 extern void early_init_fdt_reserve_self(void);
@@ -86,6 +87,7 @@ extern void unflatten_and_copy_device_tree(void);
 extern void early_init_devtree(void *);
 extern void early_get_first_memblock_info(void *, phys_addr_t *);
 #else /* CONFIG_OF_EARLY_FLATTREE */
+static inline void early_init_dt_check_for_usable_mem_range(void) {}
 static inline int early_init_dt_scan_chosen_stdout(void) { return -ENODEV; }
 static inline void early_init_fdt_scan_reserved_mem(void) {}
 static inline void early_init_fdt_reserve_self(void) {}
-- 
cgit v1.2.3


From f2f8115fe8b390af27d013411045bd712a812103 Mon Sep 17 00:00:00 2001
From: Roger Quadros <rogerq@kernel.org>
Date: Tue, 21 Dec 2021 15:17:56 +0200
Subject: memory: omap-gpmc: Use a compatible match table when checking for
 NAND controller

As more compatibles can be added to the GPMC NAND controller driver
use a compatible match table.

Signed-off-by: Roger Quadros <rogerq@kernel.org>
Acked-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/r/20211221131757.2030-4-rogerq@kernel.org
[krzysztof: remove "is_nand" variable]
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
---
 include/linux/platform_data/mtd-nand-omap2.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/mtd-nand-omap2.h b/include/linux/platform_data/mtd-nand-omap2.h
index de6ada739121..92f011805ad4 100644
--- a/include/linux/platform_data/mtd-nand-omap2.h
+++ b/include/linux/platform_data/mtd-nand-omap2.h
@@ -7,6 +7,7 @@
 #define	_MTD_NAND_OMAP2_H
 
 #include <linux/mtd/partitions.h>
+#include <linux/mod_devicetable.h>
 
 #define	GPMC_BCH_NUM_REMAINDER	8
 
@@ -61,4 +62,10 @@ struct gpmc_nand_regs {
 	void __iomem	*gpmc_bch_result5[GPMC_BCH_NUM_REMAINDER];
 	void __iomem	*gpmc_bch_result6[GPMC_BCH_NUM_REMAINDER];
 };
-#endif
+
+static const struct of_device_id omap_nand_ids[] = {
+	{ .compatible = "ti,omap2-nand", },
+	{},
+};
+
+#endif /* _MTD_NAND_OMAP2_H */
-- 
cgit v1.2.3


From d7f55471db2719629f773c2d6b5742a69595bfd3 Mon Sep 17 00:00:00 2001
From: Jackie Liu <liuyun01@kylinos.cn>
Date: Fri, 17 Dec 2021 10:07:54 +0800
Subject: memblock: fix memblock_phys_alloc() section mismatch error

Fix modpost Section mismatch error in memblock_phys_alloc()

[...]
WARNING: modpost: vmlinux.o(.text.unlikely+0x1dcc): Section mismatch in reference
from the function memblock_phys_alloc() to the function .init.text:memblock_phys_alloc_range()
The function memblock_phys_alloc() references
the function __init memblock_phys_alloc_range().
This is often because memblock_phys_alloc lacks a __init
annotation or the annotation of memblock_phys_alloc_range is wrong.

ERROR: modpost: Section mismatches detected.
Set CONFIG_SECTION_MISMATCH_WARN_ONLY=y to allow them.
[...]

memblock_phys_alloc() is a one-line wrapper, make it __always_inline to
avoid these section mismatches.

Reported-by: k2ci <kernel-bot@kylinos.cn>
Suggested-by: Mike Rapoport <rppt@kernel.org>
Signed-off-by: Jackie Liu <liuyun01@kylinos.cn>
[rppt: slightly massaged changelog ]
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Link: https://lore.kernel.org/r/20211217020754.2874872-1-liu.yun@linux.dev
---
 include/linux/memblock.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 8adcf1fa8096..9dc7cb239d21 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -405,8 +405,8 @@ phys_addr_t memblock_alloc_range_nid(phys_addr_t size,
 				      phys_addr_t end, int nid, bool exact_nid);
 phys_addr_t memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid);
 
-static inline phys_addr_t memblock_phys_alloc(phys_addr_t size,
-					      phys_addr_t align)
+static __always_inline phys_addr_t memblock_phys_alloc(phys_addr_t size,
+						       phys_addr_t align)
 {
 	return memblock_phys_alloc_range(size, align, 0,
 					 MEMBLOCK_ALLOC_ACCESSIBLE);
-- 
cgit v1.2.3


From 6fd3c510ee4b37f2f9fe3d3cafbfa459e15c5e11 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Wed, 22 Dec 2021 13:15:32 -0800
Subject: bio.h: fix kernel-doc warnings

Fix all kernel-doc warnings in <linux/bio.h>:

include/linux/bio.h:136: warning: Function parameter or member 'nbytes' not described in 'bio_advance'
include/linux/bio.h:136: warning: Excess function parameter 'bytes' description in 'bio_advance'
include/linux/bio.h:391: warning: No description found for return value of 'bio_next_split'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: linux-block@vger.kernel.org
Link: https://lore.kernel.org/r/20211222211532.24060-1-rdunlap@infradead.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bio.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index fe6bdfbbef66..0a41efe02208 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -124,7 +124,7 @@ void __bio_advance(struct bio *, unsigned bytes);
 /**
  * bio_advance - increment/complete a bio by some number of bytes
  * @bio:	bio to advance
- * @bytes:	number of bytes to complete
+ * @nbytes:	number of bytes to complete
  *
  * This updates bi_sector, bi_size and bi_idx; if the number of bytes to
  * complete doesn't align with a bvec boundary, then bv_len and bv_offset will
@@ -332,7 +332,7 @@ extern struct bio *bio_split(struct bio *bio, int sectors,
  * @gfp:	gfp mask
  * @bs:		bio set to allocate from
  *
- * Returns a bio representing the next @sectors of @bio - if the bio is smaller
+ * Return: a bio representing the next @sectors of @bio - if the bio is smaller
  * than @sectors, returns the original bio unchanged.
  */
 static inline struct bio *bio_next_split(struct bio *bio, int sectors,
-- 
cgit v1.2.3


From a16c7246368db8935652c805bc446928d0e1c0aa Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Wed, 22 Dec 2021 13:52:39 -0800
Subject: block: remove unnecessary trailing '\'

While harmless, the blank line is certainly not intended to be part of
the rq_list_for_each() macro. Remove it.

Signed-off-by: Keith Busch <kbusch@kernel.org>
Link: https://lore.kernel.org/r/20211222215239.1768164-1-kbusch@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index bb5fb7282e6e..22746b2d6825 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1363,7 +1363,7 @@ struct io_comp_batch {
 })
 
 #define rq_list_for_each(listptr, pos)			\
-	for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos)) \
+	for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos))
 
 #define rq_list_next(rq)	(rq)->rq_next
 #define rq_list_empty(list)	((list) == (struct request *) NULL)
-- 
cgit v1.2.3


From 6e1fcab00a23f7fe9f4fe9704905a790efa1eeab Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Mon, 20 Dec 2021 19:21:26 +0800
Subject: scsi: block: pm: Always set request queue runtime active in
 blk_post_runtime_resume()

John Garry reported a deadlock that occurs when trying to access a
runtime-suspended SATA device.  For obscure reasons, the rescan procedure
causes the link to be hard-reset, which disconnects the device.

The rescan tries to carry out a runtime resume when accessing the device.
scsi_rescan_device() holds the SCSI device lock and won't release it until
it can put commands onto the device's block queue.  This can't happen until
the queue is successfully runtime-resumed or the device is unregistered.
But the runtime resume fails because the device is disconnected, and
__scsi_remove_device() can't do the unregistration because it can't get the
device lock.

The best way to resolve this deadlock appears to be to allow the block
queue to start running again even after an unsuccessful runtime resume.
The idea is that the driver or the SCSI error handler will need to be able
to use the queue to resolve the runtime resume failure.

This patch removes the err argument to blk_post_runtime_resume() and makes
the routine act as though the resume was successful always.  This fixes the
deadlock.

Link: https://lore.kernel.org/r/1639999298-244569-4-git-send-email-chenxiang66@hisilicon.com
Fixes: e27829dc92e5 ("scsi: serialize ->rescan against ->remove")
Reported-and-tested-by: John Garry <john.garry@huawei.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Xiang Chen <chenxiang66@hisilicon.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/linux/blk-pm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-pm.h b/include/linux/blk-pm.h
index b80c65aba249..2580e05a8ab6 100644
--- a/include/linux/blk-pm.h
+++ b/include/linux/blk-pm.h
@@ -14,7 +14,7 @@ extern void blk_pm_runtime_init(struct request_queue *q, struct device *dev);
 extern int blk_pre_runtime_suspend(struct request_queue *q);
 extern void blk_post_runtime_suspend(struct request_queue *q, int err);
 extern void blk_pre_runtime_resume(struct request_queue *q);
-extern void blk_post_runtime_resume(struct request_queue *q, int err);
+extern void blk_post_runtime_resume(struct request_queue *q);
 extern void blk_set_runtime_active(struct request_queue *q);
 #else
 static inline void blk_pm_runtime_init(struct request_queue *q,
-- 
cgit v1.2.3


From 30be4551f9e26292599e666985119a5b559a2e4a Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 22 Dec 2021 18:32:56 +0200
Subject: wwan: Replace kernel.h with the necessary inclusions

When kernel.h is used in the headers it adds a lot into dependency hell,
especially when there are circular dependencies are involved.

Replace kernel.h inclusion with the list of what is really being used.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Sergey Ryazanov <ryazanov.s.a@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/wwan.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/wwan.h b/include/linux/wwan.h
index e143c88bf4b0..afb3334ec8c5 100644
--- a/include/linux/wwan.h
+++ b/include/linux/wwan.h
@@ -4,12 +4,9 @@
 #ifndef __WWAN_H
 #define __WWAN_H
 
-#include <linux/device.h>
-#include <linux/kernel.h>
 #include <linux/poll.h>
-#include <linux/skbuff.h>
-#include <linux/netlink.h>
 #include <linux/netdevice.h>
+#include <linux/types.h>
 
 /**
  * enum wwan_port_type - WWAN port types
@@ -37,6 +34,10 @@ enum wwan_port_type {
 	WWAN_PORT_UNKNOWN,
 };
 
+struct device;
+struct file;
+struct netlink_ext_ack;
+struct sk_buff;
 struct wwan_port;
 
 /** struct wwan_port_ops - The WWAN port operations
-- 
cgit v1.2.3


From 7da1d1ddd1f02e5de7497a0c849256912652fb6c Mon Sep 17 00:00:00 2001
From: Nick Child <nick.child@ibm.com>
Date: Thu, 16 Dec 2021 17:00:35 -0500
Subject: cuda/pmu: Make find_via_cuda/pmu init functions

Make `find_via_cuda` and `find_via_pmu` initialization functions.
Previously, their definitions in `drivers/macintosh/via-cuda.h` include
the `__init` attribute but their alternative definitions in
`arch/powerpc/powermac/sectup./c` and prototypes in `include/linux/
cuda.h` and `include/linux/pmu.h` do not use the `__init` macro. Since,
only initialization functions call `find_via_cuda` and `find_via_pmu`
it is safe to label these functions with `__init`.

Signed-off-by: Nick Child <nick.child@ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20211216220035.605465-21-nick.child@ibm.com
---
 include/linux/cuda.h | 2 +-
 include/linux/pmu.h  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cuda.h b/include/linux/cuda.h
index 45bfe9d61271..daf3e6f98444 100644
--- a/include/linux/cuda.h
+++ b/include/linux/cuda.h
@@ -12,7 +12,7 @@
 #include <uapi/linux/cuda.h>
 
 
-extern int find_via_cuda(void);
+extern int __init find_via_cuda(void);
 extern int cuda_request(struct adb_request *req,
 			void (*done)(struct adb_request *), int nbytes, ...);
 extern void cuda_poll(void);
diff --git a/include/linux/pmu.h b/include/linux/pmu.h
index 52453a24a24f..c677442d007c 100644
--- a/include/linux/pmu.h
+++ b/include/linux/pmu.h
@@ -13,7 +13,7 @@
 #include <uapi/linux/pmu.h>
 
 
-extern int find_via_pmu(void);
+extern int __init find_via_pmu(void);
 
 extern int pmu_request(struct adb_request *req,
 		void (*done)(struct adb_request *), int nbytes, ...);
-- 
cgit v1.2.3


From 66b354064a35b6379963cba27b5d37a278fc9bd9 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Tue, 23 Nov 2021 11:16:00 +0100
Subject: powercap/drivers/dtpm: Remove unused function definition

The dtpm.h header file is exporting a function which is not
implemented neither needed. Remove it.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://lore.kernel.org/r/20211123101601.2433340-1-daniel.lezcano@linaro.org
---
 include/linux/dtpm.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dtpm.h b/include/linux/dtpm.h
index 2890f6370eb9..d37e5d06a357 100644
--- a/include/linux/dtpm.h
+++ b/include/linux/dtpm.h
@@ -70,6 +70,4 @@ void dtpm_unregister(struct dtpm *dtpm);
 
 int dtpm_register(const char *name, struct dtpm *dtpm, struct dtpm *parent);
 
-int dtpm_register_cpu(struct dtpm *parent);
-
 #endif
-- 
cgit v1.2.3


From dd123e62bdedcd3a486e48e883ec63138ec2c14c Mon Sep 17 00:00:00 2001
From: Henning Schild <henning.schild@siemens.com>
Date: Mon, 13 Dec 2021 13:04:59 +0100
Subject: platform/x86: simatic-ipc: add main driver for Siemens devices

This mainly implements detection of these devices and will allow
secondary drivers to work on such machines.

The identification is DMI-based with a vendor specific way to tell them
apart in a reliable way.

Drivers for LEDs and Watchdogs will follow to make use of that platform
detection.

There is also some code to allow secondary drivers to find GPIO memory,
that needs to be in place because the pinctrl drivers do not come up.

Signed-off-by: Henning Schild <henning.schild@siemens.com>
Link: https://lore.kernel.org/r/20211213120502.20661-2-henning.schild@siemens.com
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/platform_data/x86/simatic-ipc-base.h | 29 +++++++++
 include/linux/platform_data/x86/simatic-ipc.h      | 72 ++++++++++++++++++++++
 2 files changed, 101 insertions(+)
 create mode 100644 include/linux/platform_data/x86/simatic-ipc-base.h
 create mode 100644 include/linux/platform_data/x86/simatic-ipc.h

(limited to 'include/linux')

diff --git a/include/linux/platform_data/x86/simatic-ipc-base.h b/include/linux/platform_data/x86/simatic-ipc-base.h
new file mode 100644
index 000000000000..62d2bc774067
--- /dev/null
+++ b/include/linux/platform_data/x86/simatic-ipc-base.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Siemens SIMATIC IPC drivers
+ *
+ * Copyright (c) Siemens AG, 2018-2021
+ *
+ * Authors:
+ *  Henning Schild <henning.schild@siemens.com>
+ *  Gerd Haeussler <gerd.haeussler.ext@siemens.com>
+ */
+
+#ifndef __PLATFORM_DATA_X86_SIMATIC_IPC_BASE_H
+#define __PLATFORM_DATA_X86_SIMATIC_IPC_BASE_H
+
+#include <linux/types.h>
+
+#define SIMATIC_IPC_DEVICE_NONE 0
+#define SIMATIC_IPC_DEVICE_227D 1
+#define SIMATIC_IPC_DEVICE_427E 2
+#define SIMATIC_IPC_DEVICE_127E 3
+#define SIMATIC_IPC_DEVICE_227E 4
+
+struct simatic_ipc_platform {
+	u8	devmode;
+};
+
+u32 simatic_ipc_get_membase0(unsigned int p2sb);
+
+#endif /* __PLATFORM_DATA_X86_SIMATIC_IPC_BASE_H */
diff --git a/include/linux/platform_data/x86/simatic-ipc.h b/include/linux/platform_data/x86/simatic-ipc.h
new file mode 100644
index 000000000000..f3b76b39776b
--- /dev/null
+++ b/include/linux/platform_data/x86/simatic-ipc.h
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Siemens SIMATIC IPC drivers
+ *
+ * Copyright (c) Siemens AG, 2018-2021
+ *
+ * Authors:
+ *  Henning Schild <henning.schild@siemens.com>
+ *  Gerd Haeussler <gerd.haeussler.ext@siemens.com>
+ */
+
+#ifndef __PLATFORM_DATA_X86_SIMATIC_IPC_H
+#define __PLATFORM_DATA_X86_SIMATIC_IPC_H
+
+#include <linux/dmi.h>
+#include <linux/platform_data/x86/simatic-ipc-base.h>
+
+#define SIMATIC_IPC_DMI_ENTRY_OEM	129
+/* binary type */
+#define SIMATIC_IPC_DMI_TYPE		0xff
+#define SIMATIC_IPC_DMI_GROUP		0x05
+#define SIMATIC_IPC_DMI_ENTRY		0x02
+#define SIMATIC_IPC_DMI_TID		0x02
+
+enum simatic_ipc_station_ids {
+	SIMATIC_IPC_INVALID_STATION_ID = 0,
+	SIMATIC_IPC_IPC227D = 0x00000501,
+	SIMATIC_IPC_IPC427D = 0x00000701,
+	SIMATIC_IPC_IPC227E = 0x00000901,
+	SIMATIC_IPC_IPC277E = 0x00000902,
+	SIMATIC_IPC_IPC427E = 0x00000A01,
+	SIMATIC_IPC_IPC477E = 0x00000A02,
+	SIMATIC_IPC_IPC127E = 0x00000D01,
+};
+
+static inline u32 simatic_ipc_get_station_id(u8 *data, int max_len)
+{
+	struct {
+		u8	type;		/* type (0xff = binary) */
+		u8	len;		/* len of data entry */
+		u8	group;
+		u8	entry;
+		u8	tid;
+		__le32	station_id;	/* station id (LE) */
+	} __packed * data_entry = (void *)data + sizeof(struct dmi_header);
+
+	while ((u8 *)data_entry < data + max_len) {
+		if (data_entry->type == SIMATIC_IPC_DMI_TYPE &&
+		    data_entry->len == sizeof(*data_entry) &&
+		    data_entry->group == SIMATIC_IPC_DMI_GROUP &&
+		    data_entry->entry == SIMATIC_IPC_DMI_ENTRY &&
+		    data_entry->tid == SIMATIC_IPC_DMI_TID) {
+			return le32_to_cpu(data_entry->station_id);
+		}
+		data_entry = (void *)((u8 *)(data_entry) + data_entry->len);
+	}
+
+	return SIMATIC_IPC_INVALID_STATION_ID;
+}
+
+static inline void
+simatic_ipc_find_dmi_entry_helper(const struct dmi_header *dh, void *_data)
+{
+	u32 *id = _data;
+
+	if (dh->type != SIMATIC_IPC_DMI_ENTRY_OEM)
+		return;
+
+	*id = simatic_ipc_get_station_id((u8 *)dh, dh->length);
+}
+
+#endif /* __PLATFORM_DATA_X86_SIMATIC_IPC_H */
-- 
cgit v1.2.3


From b86947b52f0d0e5b6e6f0510933ca13aad266e47 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Fri, 24 Dec 2021 10:10:29 +0800
Subject: ASoC/soundwire: intel: simplify callbacks for params/hw_free

We don't really need to pass a substream to the callback, we only need
the direction. No functionality change, only simplification to enable
improve suspend with paused streams.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@intel.com>
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Acked-By: Vinod Koul <vkoul@kernel.org>
Link: https://lore.kernel.org/r/20211224021034.26635-3-yung-chuan.liao@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/soundwire/sdw_intel.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h
index 8a463b8fc12a..67e0d3e750b5 100644
--- a/include/linux/soundwire/sdw_intel.h
+++ b/include/linux/soundwire/sdw_intel.h
@@ -92,7 +92,7 @@
  * firmware.
  */
 struct sdw_intel_stream_params_data {
-	struct snd_pcm_substream *substream;
+	int stream;
 	struct snd_soc_dai *dai;
 	struct snd_pcm_hw_params *hw_params;
 	int link_id;
@@ -105,7 +105,7 @@ struct sdw_intel_stream_params_data {
  * firmware.
  */
 struct sdw_intel_stream_free_data {
-	struct snd_pcm_substream *substream;
+	int stream;
 	struct snd_soc_dai *dai;
 	int link_id;
 };
-- 
cgit v1.2.3


From 54bf7fa3efd08eea03e4bac04e188ee3db6173a7 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Mon, 13 Dec 2021 17:11:45 +0100
Subject: ima: Fix undefined arch_ima_get_secureboot() and co

Currently arch_ima_get_secureboot() and arch_get_ima_policy() are
defined only when CONFIG_IMA is set, and this makes any code calling
those functions without CONFIG_IMA fail.

Move the declaration and the dummy definition of those functions
outside ifdef-CONFIG_IMA block for fixing the undefined symbols.

Signed-off-by: Takashi Iwai <tiwai@suse.de>
[zohar@linux.ibm.com: removed in-tree/out-of-tree comment in patch description]
Reviewed-by: Petr Vorel <pvorel@suse.cz>
Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
---
 include/linux/ima.h | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ima.h b/include/linux/ima.h
index b6ab66a546ae..426b1744215e 100644
--- a/include/linux/ima.h
+++ b/include/linux/ima.h
@@ -50,21 +50,6 @@ static inline void ima_appraise_parse_cmdline(void) {}
 extern void ima_add_kexec_buffer(struct kimage *image);
 #endif
 
-#ifdef CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT
-extern bool arch_ima_get_secureboot(void);
-extern const char * const *arch_get_ima_policy(void);
-#else
-static inline bool arch_ima_get_secureboot(void)
-{
-	return false;
-}
-
-static inline const char * const *arch_get_ima_policy(void)
-{
-	return NULL;
-}
-#endif
-
 #else
 static inline enum hash_algo ima_get_current_hash_algo(void)
 {
@@ -155,6 +140,21 @@ static inline int ima_measure_critical_data(const char *event_label,
 
 #endif /* CONFIG_IMA */
 
+#ifdef CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT
+extern bool arch_ima_get_secureboot(void);
+extern const char * const *arch_get_ima_policy(void);
+#else
+static inline bool arch_ima_get_secureboot(void)
+{
+	return false;
+}
+
+static inline const char * const *arch_get_ima_policy(void)
+{
+	return NULL;
+}
+#endif
+
 #ifndef CONFIG_IMA_KEXEC
 struct kimage;
 
-- 
cgit v1.2.3


From 94ab10dd42a70acc5208a41325617e3d9cf81a70 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Fri, 24 Dec 2021 21:12:48 -0800
Subject: mm: delete unsafe BUG from page_cache_add_speculative()

It is not easily reproducible, but on 5.16-rc I have several times hit
the VM_BUG_ON_PAGE(PageTail(page), page) in
page_cache_add_speculative(): usually from filemap_get_read_batch() for
an ext4 read, yesterday from next_uptodate_page() from
filemap_map_pages() for a shmem fault.

That BUG used to be placed where page_ref_add_unless() had succeeded,
but now it is placed before folio_ref_add_unless() is attempted: that is
not safe, since it is only the acquired reference which makes the page
safe from racing THP collapse or split.

We could keep the BUG, checking PageTail only when
folio_ref_try_add_rcu() has succeeded; but I don't think it adds much
value - just delete it.

Link: https://lkml.kernel.org/r/8b98fc6f-3439-8614-c3f3-945c659a1aba@google.com
Fixes: 020853b6f5ea ("mm: Add folio_try_get_rcu()")
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: William Kucharski <william.kucharski@oracle.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagemap.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 605246452305..d150a9082b31 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -285,7 +285,6 @@ static inline struct inode *folio_inode(struct folio *folio)
 
 static inline bool page_cache_add_speculative(struct page *page, int count)
 {
-	VM_BUG_ON_PAGE(PageTail(page), page);
 	return folio_ref_try_add_rcu((struct folio *)page, count);
 }
 
-- 
cgit v1.2.3


From 595ec1973c276f6c0c1de8aca5eef8dfd81f9b49 Mon Sep 17 00:00:00 2001
From: Thibaut Sautereau <thibaut.sautereau@ssi.gouv.fr>
Date: Fri, 24 Dec 2021 21:12:51 -0800
Subject: mm/page_alloc: fix __alloc_size attribute for alloc_pages_exact_nid

The second parameter of alloc_pages_exact_nid is the one indicating the
size of memory pointed by the returned pointer.

Link: https://lkml.kernel.org/r/YbjEgwhn4bGblp//@coeus
Fixes: abd58f38dfb4 ("mm/page_alloc: add __alloc_size attributes for better bounds checking")
Signed-off-by: Thibaut Sautereau <thibaut.sautereau@ssi.gouv.fr>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Daniel Micay <danielmicay@gmail.com>
Cc: Levente Polyak <levente@leventepolyak.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index b976c4177299..8fcc38467af6 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -624,7 +624,7 @@ extern unsigned long get_zeroed_page(gfp_t gfp_mask);
 
 void *alloc_pages_exact(size_t size, gfp_t gfp_mask) __alloc_size(1);
 void free_pages_exact(void *virt, size_t size);
-__meminit void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) __alloc_size(1);
+__meminit void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) __alloc_size(2);
 
 #define __get_free_page(gfp_mask) \
 		__get_free_pages((gfp_mask), 0)
-- 
cgit v1.2.3


From 4fb0abfee424b05f0ec6d2d09e38f04ee2b82a8a Mon Sep 17 00:00:00 2001
From: Yazen Ghannam <yazen.ghannam@amd.com>
Date: Mon, 8 Nov 2021 15:51:21 -0600
Subject: x86/amd_nb: Add AMD Family 19h Models (10h-1Fh) and (A0h-AFh) PCI IDs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add the new PCI Device IDs to support new generation of AMD 19h family of
processors.

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Babu Moger <babu.moger@amd.com>
Acked-by: Krzysztof Wilczyński <kw@linux.com>
Acked-by: Borislav Petkov <bp@suse.de>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>  # pci_ids.h
Link: https://lore.kernel.org/r/163640828133.955062.18349019796157170473.stgit@bmoger-ubuntu
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 include/linux/pci_ids.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 011f2f1ea5bb..b5248f27910e 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -555,6 +555,7 @@
 #define PCI_DEVICE_ID_AMD_17H_M60H_DF_F3 0x144b
 #define PCI_DEVICE_ID_AMD_17H_M70H_DF_F3 0x1443
 #define PCI_DEVICE_ID_AMD_19H_DF_F3	0x1653
+#define PCI_DEVICE_ID_AMD_19H_M10H_DF_F3 0x14b0
 #define PCI_DEVICE_ID_AMD_19H_M40H_DF_F3 0x167c
 #define PCI_DEVICE_ID_AMD_19H_M50H_DF_F3 0x166d
 #define PCI_DEVICE_ID_AMD_CNB17H_F3	0x1703
-- 
cgit v1.2.3


From 11a24ca7e34d968991a7d437b950d1924396bd81 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Thu, 25 Nov 2021 03:08:38 +0100
Subject: hwmon: (ntc_thermistor) Merge platform data into driver

Platform data is supposed to be used with "board files",
device descriptions in C. Since the introduction of the
NTC driver in 2011, no such platforms have been submitted
to the Linux kernel, and their use is strongly discouraged
in favor of Device Tree, ACPI or as last resort software
firmware nodes.

Drop the external header and copy the platform data into
the driver file.

Cc: Peter Rosin <peda@axentia.se>
Cc: Chris Lesiak <chris.lesiak@licor.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20211125020841.3616359-2-linus.walleij@linaro.org
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 include/linux/platform_data/ntc_thermistor.h | 50 ----------------------------
 1 file changed, 50 deletions(-)
 delete mode 100644 include/linux/platform_data/ntc_thermistor.h

(limited to 'include/linux')

diff --git a/include/linux/platform_data/ntc_thermistor.h b/include/linux/platform_data/ntc_thermistor.h
deleted file mode 100644
index b324d03e580c..000000000000
--- a/include/linux/platform_data/ntc_thermistor.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * ntc_thermistor.h - NTC Thermistors
- *
- *  Copyright (C) 2010 Samsung Electronics
- *  MyungJoo Ham <myungjoo.ham@samsung.com>
- */
-#ifndef _LINUX_NTC_H
-#define _LINUX_NTC_H
-
-struct iio_channel;
-
-enum ntc_thermistor_type {
-	TYPE_B57330V2103,
-	TYPE_B57891S0103,
-	TYPE_NCPXXWB473,
-	TYPE_NCPXXWF104,
-	TYPE_NCPXXWL333,
-	TYPE_NCPXXXH103,
-};
-
-struct ntc_thermistor_platform_data {
-	/*
-	 * One (not both) of read_uV and read_ohm should be provided and only
-	 * one of the two should be provided.
-	 * Both functions should return negative value for an error case.
-	 *
-	 * pullup_uV, pullup_ohm, pulldown_ohm, and connect are required to use
-	 * read_uV()
-	 *
-	 * How to setup pullup_ohm, pulldown_ohm, and connect is
-	 * described at Documentation/hwmon/ntc_thermistor.rst
-	 *
-	 * pullup/down_ohm: 0 for infinite / not-connected
-	 *
-	 * chan: iio_channel pointer to communicate with the ADC which the
-	 * thermistor is using for conversion of the analog values.
-	 */
-	int (*read_uv)(struct ntc_thermistor_platform_data *);
-	unsigned int pullup_uv;
-
-	unsigned int pullup_ohm;
-	unsigned int pulldown_ohm;
-	enum { NTC_CONNECTED_POSITIVE, NTC_CONNECTED_GROUND } connect;
-	struct iio_channel *chan;
-
-	int (*read_ohm)(void);
-};
-
-#endif /* _LINUX_NTC_H */
-- 
cgit v1.2.3


From 130d168866a11829b844ffdb19b9aefe384f754c Mon Sep 17 00:00:00 2001
From: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Date: Thu, 16 Dec 2021 16:42:57 +0100
Subject: hwmon: prefix kernel-doc comments for structs with struct

The command ./scripts/kernel-doc -none include/linux/hwmon.h warns:

  include/linux/hwmon.h:406: warning: This comment starts with '/**', but
    isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst
   * Channel information
  include/linux/hwmon.h:425: warning: This comment starts with '/**', but
    isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst
   * Chip configuration

Address those kernel-doc warnings by prefixing kernel-doc descriptions for
structs with the keyword 'struct'.

Signed-off-by: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Link: https://lore.kernel.org/r/20211216154257.26758-1-lukas.bulwahn@gmail.com
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 include/linux/hwmon.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h
index 1e8d6ea8992e..fad1f1df26df 100644
--- a/include/linux/hwmon.h
+++ b/include/linux/hwmon.h
@@ -403,7 +403,7 @@ struct hwmon_ops {
 };
 
 /**
- * Channel information
+ * struct hwmon_channel_info - Channel information
  * @type:	Channel type.
  * @config:	Pointer to NULL-terminated list of channel parameters.
  *		Use for per-channel attributes.
@@ -422,7 +422,7 @@ struct hwmon_channel_info {
 	})
 
 /**
- * Chip configuration
+ * struct hwmon_chip_info - Chip configuration
  * @ops:	Pointer to hwmon operations.
  * @info:	Null-terminated list of channel information.
  */
-- 
cgit v1.2.3


From ee6d3dd4ed48ab24b74bab3c3977b8218518247d Mon Sep 17 00:00:00 2001
From: Wedson Almeida Filho <wedsonaf@google.com>
Date: Fri, 24 Dec 2021 23:13:45 +0000
Subject: driver core: make kobj_type constant.

This way instances of kobj_type (which contain function pointers) can be
stored in .rodata, which means that they cannot be [easily/accidentally]
modified at runtime.

Signed-off-by: Wedson Almeida Filho <wedsonaf@google.com>
Link: https://lore.kernel.org/r/20211224231345.777370-1-wedsonaf@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/kobject.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kobject.h b/include/linux/kobject.h
index c740062b4b1a..683172b2e094 100644
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -66,7 +66,7 @@ struct kobject {
 	struct list_head	entry;
 	struct kobject		*parent;
 	struct kset		*kset;
-	struct kobj_type	*ktype;
+	const struct kobj_type	*ktype;
 	struct kernfs_node	*sd; /* sysfs directory entry */
 	struct kref		kref;
 #ifdef CONFIG_DEBUG_KOBJECT_RELEASE
@@ -90,13 +90,13 @@ static inline const char *kobject_name(const struct kobject *kobj)
 	return kobj->name;
 }
 
-extern void kobject_init(struct kobject *kobj, struct kobj_type *ktype);
+extern void kobject_init(struct kobject *kobj, const struct kobj_type *ktype);
 extern __printf(3, 4) __must_check
 int kobject_add(struct kobject *kobj, struct kobject *parent,
 		const char *fmt, ...);
 extern __printf(4, 5) __must_check
 int kobject_init_and_add(struct kobject *kobj,
-			 struct kobj_type *ktype, struct kobject *parent,
+			 const struct kobj_type *ktype, struct kobject *parent,
 			 const char *fmt, ...);
 
 extern void kobject_del(struct kobject *kobj);
@@ -217,7 +217,7 @@ static inline void kset_put(struct kset *k)
 	kobject_put(&k->kobj);
 }
 
-static inline struct kobj_type *get_ktype(struct kobject *kobj)
+static inline const struct kobj_type *get_ktype(struct kobject *kobj)
 {
 	return kobj->ktype;
 }
-- 
cgit v1.2.3


From 1882de7fc56c2b0ea91dd9fd9922d434fc3feb15 Mon Sep 17 00:00:00 2001
From: Chen Yu <yu.c.chen@intel.com>
Date: Wed, 22 Dec 2021 12:31:03 +0800
Subject: efi: Introduce EFI_FIRMWARE_MANAGEMENT_CAPSULE_HEADER and
 corresponding structures

Platform Firmware Runtime Update image starts with UEFI headers, and the
headers are defined in UEFI specification, but some of them have not been
defined in the kernel yet.

For example, the header layout of a capsule file looks like this:

EFI_CAPSULE_HEADER
EFI_FIRMWARE_MANAGEMENT_CAPSULE_HEADER
EFI_FIRMWARE_MANAGEMENT_CAPSULE_IMAGE_HEADER
EFI_FIRMWARE_IMAGE_AUTHENTICATION

These structures would be used by the Platform Firmware Runtime Update
driver to parse the format of capsule file to verify if the corresponding
version number is valid. In this way, if the user provides an invalid
capsule image, the kernel could be used as a guard to reject it, without
switching to the Management Mode (which might be costly).

EFI_CAPSULE_HEADER has been defined in the kernel, but the other
structures have not been defined yet, so do that. Besides,
EFI_FIRMWARE_MANAGEMENT_CAPSULE_HEADER and
EFI_FIRMWARE_MANAGEMENT_CAPSULE_IMAGE_HEADER are required to be packed
in the uefi specification. For this reason, use the __packed attribute
to indicate to the compiler that the entire structure can appear
misaligned in memory (as suggested by Ard) in case one of them follows
the other directly in a capsule header.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/efi.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/efi.h b/include/linux/efi.h
index dbd39b20e034..80e970f7e6f8 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -148,6 +148,52 @@ typedef struct {
 	u32 imagesize;
 } efi_capsule_header_t;
 
+/* EFI_FIRMWARE_MANAGEMENT_CAPSULE_HEADER */
+struct efi_manage_capsule_header {
+	u32 ver;
+	u16 emb_drv_cnt;
+	u16 payload_cnt;
+	/*
+	 * Variable-size array of the size given by the sum of
+	 * emb_drv_cnt and payload_cnt.
+	 */
+	u64 offset_list[];
+} __packed;
+
+/* EFI_FIRMWARE_MANAGEMENT_CAPSULE_IMAGE_HEADER */
+struct efi_manage_capsule_image_header {
+	u32 ver;
+	efi_guid_t image_type_id;
+	u8 image_index;
+	u8 reserved_bytes[3];
+	u32 image_size;
+	u32 vendor_code_size;
+	/* hw_ins was introduced in version 2 */
+	u64 hw_ins;
+	/* capsule_support was introduced in version 3 */
+	u64 capsule_support;
+} __packed;
+
+/* WIN_CERTIFICATE */
+struct win_cert {
+	u32 len;
+	u16 rev;
+	u16 cert_type;
+};
+
+/* WIN_CERTIFICATE_UEFI_GUID */
+struct win_cert_uefi_guid {
+	struct win_cert	hdr;
+	efi_guid_t cert_type;
+	u8 cert_data[];
+};
+
+/* EFI_FIRMWARE_IMAGE_AUTHENTICATION */
+struct efi_image_auth {
+	u64 mon_count;
+	struct win_cert_uefi_guid auth_info;
+};
+
 /*
  * EFI capsule flags
  */
-- 
cgit v1.2.3


From cf6299b6101903c31bddb0065804b2121ed510c7 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Mon, 27 Dec 2021 17:39:24 +0100
Subject: kobject: remove kset from struct kset_uevent_ops callbacks

There is no need to pass the pointer to the kset in the struct
kset_uevent_ops callbacks as no one uses it, so just remove that pointer
entirely.

Reviewed-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Wedson Almeida Filho <wedsonaf@google.com>
Link: https://lore.kernel.org/r/20211227163924.3970661-1-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/kobject.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kobject.h b/include/linux/kobject.h
index 683172b2e094..ad90b49824dc 100644
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -153,10 +153,9 @@ struct kobj_uevent_env {
 };
 
 struct kset_uevent_ops {
-	int (* const filter)(struct kset *kset, struct kobject *kobj);
-	const char *(* const name)(struct kset *kset, struct kobject *kobj);
-	int (* const uevent)(struct kset *kset, struct kobject *kobj,
-		      struct kobj_uevent_env *env);
+	int (* const filter)(struct kobject *kobj);
+	const char *(* const name)(struct kobject *kobj);
+	int (* const uevent)(struct kobject *kobj, struct kobj_uevent_env *env);
 };
 
 struct kobj_attribute {
-- 
cgit v1.2.3


From d6b9c679bbac1d1d2fcac64391b4cadb91763a6f Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Fri, 12 Nov 2021 14:46:32 -0800
Subject: watchdog: bcm7038_wdt: Support platform data configuration

The BCM7038 watchdog driver needs to be able to obtain a specific clock
name on BCM63xx platforms which is the "periph" clock ticking at 50MHz.
make it possible to specify the clock name to obtain via platform data.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Link: https://lore.kernel.org/r/20211112224636.395101-4-f.fainelli@gmail.com
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
---
 include/linux/platform_data/bcm7038_wdt.h | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100644 include/linux/platform_data/bcm7038_wdt.h

(limited to 'include/linux')

diff --git a/include/linux/platform_data/bcm7038_wdt.h b/include/linux/platform_data/bcm7038_wdt.h
new file mode 100644
index 000000000000..e18cfd9ec8f9
--- /dev/null
+++ b/include/linux/platform_data/bcm7038_wdt.h
@@ -0,0 +1,8 @@
+#ifndef __BCM7038_WDT_PDATA_H
+#define __BCM7038_WDT_PDATA_H
+
+struct bcm7038_wdt_platform_data {
+	const char *clk_name;
+};
+
+#endif /* __BCM7038_WDT_PDATA_H */
-- 
cgit v1.2.3


From b92e301633f0f454aa1cfedac2e096bb9649b367 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jonathan=20Neusch=C3=A4fer?= <j.neuschaefer@gmx.net>
Date: Sat, 18 Dec 2021 16:25:53 +0100
Subject: mfd: ntxec: Change return type of ntxec_reg8 from __be16 to u16
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Register values in NTXEC are big-endian on the I2C bus, but the regmap
subsystem handles the conversion between CPU-endian and big-endian data
internally. ntxec_reg8 should thus return u16, not __be16.

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Jonathan Neuschäfer <j.neuschaefer@gmx.net>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20211218152553.744615-1-j.neuschaefer@gmx.net
---
 include/linux/mfd/ntxec.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/ntxec.h b/include/linux/mfd/ntxec.h
index 26ab3b8eb612..cc6f07bfa2b3 100644
--- a/include/linux/mfd/ntxec.h
+++ b/include/linux/mfd/ntxec.h
@@ -26,7 +26,7 @@ struct ntxec {
  * This convenience function converts an 8-bit value to 16-bit for use in the
  * second kind of register.
  */
-static inline __be16 ntxec_reg8(u8 value)
+static inline u16 ntxec_reg8(u8 value)
 {
 	return value << 8;
 }
-- 
cgit v1.2.3


From b6459415b384cb829f0b2a4268f211c789f6cf0b Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 28 Dec 2021 16:49:13 -0800
Subject: net: Don't include filter.h from net/sock.h

sock.h is pretty heavily used (5k objects rebuilt on x86 after
it's touched). We can drop the include of filter.h from it and
add a forward declaration of struct sk_filter instead.
This decreases the number of rebuilt objects when bpf.h
is touched from ~5k to ~1k.

There's a lot of missing includes this was masking. Primarily
in networking tho, this time.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Marc Kleine-Budde <mkl@pengutronix.de>
Acked-by: Florian Fainelli <f.fainelli@gmail.com>
Acked-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Acked-by: Stefano Garzarella <sgarzare@redhat.com>
Link: https://lore.kernel.org/bpf/20211229004913.513372-1-kuba@kernel.org
---
 include/linux/bpf_local_storage.h | 1 +
 include/linux/dsa/loop.h          | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 24496bc28e7b..a2b625960ffe 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -8,6 +8,7 @@
 #define _BPF_LOCAL_STORAGE_H
 
 #include <linux/bpf.h>
+#include <linux/filter.h>
 #include <linux/rculist.h>
 #include <linux/list.h>
 #include <linux/hash.h>
diff --git a/include/linux/dsa/loop.h b/include/linux/dsa/loop.h
index 5a3470bcc8a7..b8fef35591aa 100644
--- a/include/linux/dsa/loop.h
+++ b/include/linux/dsa/loop.h
@@ -2,6 +2,7 @@
 #ifndef DSA_LOOP_H
 #define DSA_LOOP_H
 
+#include <linux/if_vlan.h>
 #include <linux/types.h>
 #include <linux/ethtool.h>
 #include <net/dsa.h>
-- 
cgit v1.2.3


From 0fe4b381a59ebc53522fce579b281a67a9e1bee6 Mon Sep 17 00:00:00 2001
From: KP Singh <kpsingh@kernel.org>
Date: Fri, 24 Dec 2021 15:29:15 +0000
Subject: bpf: Allow bpf_local_storage to be used by sleepable programs

Other maps like hashmaps are already available to sleepable programs.
Sleepable BPF programs run under trace RCU. Allow task, sk and inode
storage to be used from sleepable programs. This allows sleepable and
non-sleepable programs to provide shareable annotations on kernel
objects.

Sleepable programs run in trace RCU where as non-sleepable programs run
in a normal RCU critical section i.e.  __bpf_prog_enter{_sleepable}
and __bpf_prog_exit{_sleepable}) (rcu_read_lock or rcu_read_lock_trace).

In order to make the local storage maps accessible to both sleepable
and non-sleepable programs, one needs to call both
call_rcu_tasks_trace and call_rcu to wait for both trace and classical
RCU grace periods to expire before freeing memory.

Paul's work on call_rcu_tasks_trace allows us to have per CPU queueing
for call_rcu_tasks_trace. This behaviour can be achieved by setting
rcupdate.rcu_task_enqueue_lim=<num_cpus> boot parameter.

In light of these new performance changes and to keep the local storage
code simple, avoid adding a new flag for sleepable maps / local storage
to select the RCU synchronization (trace / classical).

Also, update the dereferencing of the pointers to use
rcu_derference_check (with either the trace or normal RCU locks held)
with a common bpf_rcu_lock_held helper method.

Signed-off-by: KP Singh <kpsingh@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20211224152916.1550677-2-kpsingh@kernel.org
---
 include/linux/bpf_local_storage.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index a2b625960ffe..37b3906af8b1 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -17,6 +17,9 @@
 
 #define BPF_LOCAL_STORAGE_CACHE_SIZE	16
 
+#define bpf_rcu_lock_held()                                                    \
+	(rcu_read_lock_held() || rcu_read_lock_trace_held() ||                 \
+	 rcu_read_lock_bh_held())
 struct bpf_local_storage_map_bucket {
 	struct hlist_head list;
 	raw_spinlock_t lock;
@@ -162,4 +165,6 @@ struct bpf_local_storage_data *
 bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 			 void *value, u64 map_flags);
 
+void bpf_local_storage_free_rcu(struct rcu_head *rcu);
+
 #endif /* _BPF_LOCAL_STORAGE_H */
-- 
cgit v1.2.3


From aebb51ec3db2a871d74b4afad3f9914812acf120 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 29 Dec 2021 17:27:42 -0800
Subject: bpf: Invert the dependency between bpf-netns.h and netns/bpf.h

netns/bpf.h gets included by netdevice.h (thru net_namespace.h)
which in turn gets included in a lot of places. We should keep
netns/bpf.h as light-weight as possible.

bpf-netns.h seems to contain more implementation details than
deserves to be included in a netns header. It needs to pull in
uapi/bpf.h to get various enum types.

Move enum netns_bpf_attach_type to netns/bpf.h and invert the
dependency. This makes netns/bpf.h fit the mold of a struct
definition header more clearly, and drops the number of objects
rebuilt when uapi/bpf.h is touched from 7.7k to 1.1k.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211230012742.770642-3-kuba@kernel.org
---
 include/linux/bpf-netns.h | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf-netns.h b/include/linux/bpf-netns.h
index 722f799c1a2e..413cfa5e4b07 100644
--- a/include/linux/bpf-netns.h
+++ b/include/linux/bpf-netns.h
@@ -3,15 +3,9 @@
 #define _BPF_NETNS_H
 
 #include <linux/mutex.h>
+#include <net/netns/bpf.h>
 #include <uapi/linux/bpf.h>
 
-enum netns_bpf_attach_type {
-	NETNS_BPF_INVALID = -1,
-	NETNS_BPF_FLOW_DISSECTOR = 0,
-	NETNS_BPF_SK_LOOKUP,
-	MAX_NETNS_BPF_ATTACH_TYPE
-};
-
 static inline enum netns_bpf_attach_type
 to_netns_bpf_attach_type(enum bpf_attach_type attach_type)
 {
-- 
cgit v1.2.3


From 730b49aac426e1e8016d3c2dd6b407e500423821 Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Thu, 23 Dec 2021 11:24:22 +0300
Subject: usb: typec: port-mapper: Convert to the component framework

Instead of trying to keep track of the connections to the
USB Type-C connectors separately, letting the component
framework take care of that.

From now on every USB Type-C connector will register itself
as "aggregate" - component master - and anything that can be
connected to it inside the system can then simply register
itself as a generic component.

The matching of the components and the connector shall rely
on ACPI _PLD initially. Before registering itself as the
aggregate, the connector will find all other ACPI devices
that have matching _PLD crc hash with it (matching value in
the pld_crc member of struct acpi_device), and add a
component match entry for each one of them. Because only
ACPI is supported for now, the driver shall only be build
when ACPI is supported.

This removes the need for the custom API that the driver
exposed. The components and the connector can therefore
exist completely independently of each other. The order in
which they are registered, as well as are they modules or
not, is now irrelevant.

Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Link: https://lore.kernel.org/r/20211223082422.45637-1-heikki.krogerus@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/typec.h | 12 ------------
 1 file changed, 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/usb/typec.h b/include/linux/usb/typec.h
index e2e44bb1dad8..7ba45a97eeae 100644
--- a/include/linux/usb/typec.h
+++ b/include/linux/usb/typec.h
@@ -305,16 +305,4 @@ void typec_partner_set_svdm_version(struct typec_partner *partner,
 				    enum usb_pd_svdm_ver svdm_version);
 int typec_get_negotiated_svdm_version(struct typec_port *port);
 
-#if IS_REACHABLE(CONFIG_TYPEC)
-int typec_link_port(struct device *port);
-void typec_unlink_port(struct device *port);
-#else
-static inline int typec_link_port(struct device *port)
-{
-	return 0;
-}
-
-static inline void typec_unlink_port(struct device *port) { }
-#endif
-
 #endif /* __LINUX_USB_TYPEC_H */
-- 
cgit v1.2.3


From 510a0bdb2bfcff8d7be822c72adc3add7a97d559 Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Thu, 23 Dec 2021 11:24:32 +0300
Subject: usb: Remove usb_for_each_port()

There are no more users for the function.

Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Link: https://lore.kernel.org/r/20211223082432.45653-1-heikki.krogerus@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb.h | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/usb.h b/include/linux/usb.h
index 7ccaa76a9a96..200b7b79acb5 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -875,15 +875,6 @@ extern struct usb_host_interface *usb_find_alt_setting(
 		unsigned int iface_num,
 		unsigned int alt_num);
 
-#if IS_REACHABLE(CONFIG_USB)
-int usb_for_each_port(void *data, int (*fn)(struct device *, void *));
-#else
-static inline int usb_for_each_port(void *data, int (*fn)(struct device *, void *))
-{
-	return 0;
-}
-#endif
-
 /* port claiming functions */
 int usb_hub_claim_port(struct usb_device *hdev, unsigned port1,
 		struct usb_dev_state *owner);
-- 
cgit v1.2.3


From b4a29b94804c4774f22555651296b838df6ec0e4 Mon Sep 17 00:00:00 2001
From: Lukas Wunner <lukas@wunner.de>
Date: Tue, 28 Dec 2021 18:22:00 +0100
Subject: serial: 8250: Move Alpha-specific quirk out of the core

struct uart_8250_port contains mcr_mask and mcr_force members whose
sole purpose is to work around an Alpha-specific quirk.  This code
doesn't belong in the core where it is executed by everyone else,
so move it to a proper ->set_mctrl callback which is used on the
affected Alpha machine only.

The quirk was introduced in January 1995:
https://git.kernel.org/pub/scm/linux/kernel/git/history/history.git/diff/drivers/char/serial.c?h=1.1.83

The members in struct uart_8250_port were added in 2002:
https://git.kernel.org/history/history/c/4524aad27854

The quirk applies to non-PCI Alphas and arch/alpha/Kconfig specifies
"select FORCE_PCI if !ALPHA_JENSEN".  So apparently the only affected
machine is the EISA-based Jensen that Linus was working on back then:
https://lore.kernel.org/all/CAHk-=wj1JWZ3sCrGz16nxEj7=0O+srMg6Ah3iPTDXSPKEws_SA@mail.gmail.com/

Up until now the quirk is not applied unless CONFIG_PCI is disabled.
If users forget to do that or run a generic Alpha kernel, the serial
ports aren't usable on Jensen.  Avoid by confining the quirk to
CONFIG_ALPHA_JENSEN instead of !CONFIG_PCI.  On generic Alpha kernels,
auto-detect at runtime whether the quirk needs to be applied.

Cc: Russell King <rmk+kernel@armlinux.org.uk>
Cc: Ulrich Teichert <krypton@ulrich-teichert.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Lukas Wunner <lukas@wunner.de>
Link: https://lore.kernel.org/r/b83d069cb516549b8a5420e097bb6bdd806f36fc.1640695609.git.lukas@wunner.de
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/serial_8250.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h
index 5db211f43b29..ff84a3ed10ea 100644
--- a/include/linux/serial_8250.h
+++ b/include/linux/serial_8250.h
@@ -104,8 +104,6 @@ struct uart_8250_port {
 	unsigned char		ier;
 	unsigned char		lcr;
 	unsigned char		mcr;
-	unsigned char		mcr_mask;	/* mask of user bits */
-	unsigned char		mcr_force;	/* mask of forced bits */
 	unsigned char		cur_iotype;	/* Running I/O type */
 	unsigned int		rpm_tx_active;
 	unsigned char		canary;		/* non-zero during system sleep
-- 
cgit v1.2.3


From d8e9a406a931f687945703a4bac45042eb81ce92 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm+renesas@opensource.se>
Date: Sun, 12 Dec 2021 22:21:28 +0900
Subject: serdev: BREAK/FRAME/PARITY/OVERRUN notification prototype V2

Allow serdev device drivers get notified by hardware errors such as BREAK,
FRAME, PARITY and OVERRUN.

With this patch, in the event of an error detected in the UART device driver
the serdev_device_driver will get the newly introduced ->error() callback
invoked if serdev_device_set_error_mask() has previously been used to enable
the type of error. The errors are taken straight from the TTY layer and fed
into the serdev_device_driver after filtering out only enabled errors.

Without this patch the hardware errors never reach the serdev_device_driver.

Signed-off-by: Magnus Damm <damm+renesas@opensource.se>
Link: https://lore.kernel.org/r/163931528842.27756.3665040315954968747.sendpatchset@octo
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/serdev.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/serdev.h b/include/linux/serdev.h
index 3368c261ab62..0d0b22fc7e37 100644
--- a/include/linux/serdev.h
+++ b/include/linux/serdev.h
@@ -19,12 +19,15 @@ struct serdev_device;
 
 /**
  * struct serdev_device_ops - Callback operations for a serdev device
+ * @error:		Function called with errors received from device;
+ *			may sleep.
  * @receive_buf:	Function called with data received from device;
  *			returns number of bytes accepted; may sleep.
  * @write_wakeup:	Function called when ready to transmit more data; must
  *			not sleep.
  */
 struct serdev_device_ops {
+	void (*error)(struct serdev_device *, unsigned long);
 	int (*receive_buf)(struct serdev_device *, const unsigned char *, size_t);
 	void (*write_wakeup)(struct serdev_device *);
 };
@@ -76,6 +79,11 @@ enum serdev_parity {
 	SERDEV_PARITY_ODD,
 };
 
+#define SERDEV_ERROR_BREAK 0
+#define SERDEV_ERROR_FRAME 1
+#define SERDEV_ERROR_PARITY 2
+#define SERDEV_ERROR_OVERRUN 3
+
 /*
  * serdev controller structures
  */
@@ -85,6 +93,7 @@ struct serdev_controller_ops {
 	int (*write_room)(struct serdev_controller *);
 	int (*open)(struct serdev_controller *);
 	void (*close)(struct serdev_controller *);
+	void (*set_error_mask)(struct serdev_controller *, unsigned long);
 	void (*set_flow_control)(struct serdev_controller *, bool);
 	int (*set_parity)(struct serdev_controller *, enum serdev_parity);
 	unsigned int (*set_baudrate)(struct serdev_controller *, unsigned int);
@@ -190,12 +199,24 @@ static inline int serdev_controller_receive_buf(struct serdev_controller *ctrl,
 	return serdev->ops->receive_buf(serdev, data, count);
 }
 
+static inline void serdev_controller_error(struct serdev_controller *ctrl,
+					   unsigned long errors)
+{
+	struct serdev_device *serdev = ctrl->serdev;
+
+	if (!serdev || !serdev->ops->error)
+		return;
+
+	serdev->ops->error(serdev, errors);
+}
+
 #if IS_ENABLED(CONFIG_SERIAL_DEV_BUS)
 
 int serdev_device_open(struct serdev_device *);
 void serdev_device_close(struct serdev_device *);
 int devm_serdev_device_open(struct device *, struct serdev_device *);
 unsigned int serdev_device_set_baudrate(struct serdev_device *, unsigned int);
+void serdev_device_set_error_mask(struct serdev_device *, unsigned long);
 void serdev_device_set_flow_control(struct serdev_device *, bool);
 int serdev_device_write_buf(struct serdev_device *, const unsigned char *, size_t);
 void serdev_device_wait_until_sent(struct serdev_device *, long);
@@ -238,6 +259,7 @@ static inline unsigned int serdev_device_set_baudrate(struct serdev_device *sdev
 {
 	return 0;
 }
+static inline void serdev_device_set_error_mask(struct serdev_device *sdev, unsigned long mask) {}
 static inline void serdev_device_set_flow_control(struct serdev_device *sdev, bool enable) {}
 static inline int serdev_device_write_buf(struct serdev_device *serdev,
 					  const unsigned char *buf,
-- 
cgit v1.2.3


From 5207fb2f311b0c45a9abfa1c84b7a7b657ffa550 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de>
Date: Thu, 30 Dec 2021 16:02:41 +0100
Subject: counter: Provide a wrapper to access device private data
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For now this just wraps accessing struct counter_device::priv. However
this is about to change and converting drivers to this helper
individually makes fixing device lifetime issues result in easier to
review patches.

Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: William Breathitt Gray <vilhelm.gray@gmail.com>
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Link: https://lore.kernel.org/r/20211230150300.72196-5-u.kleine-koenig@pengutronix.de
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/counter.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/counter.h b/include/linux/counter.h
index dfbde2808998..627f1757f6bb 100644
--- a/include/linux/counter.h
+++ b/include/linux/counter.h
@@ -329,6 +329,8 @@ struct counter_device {
 	struct mutex ops_exist_lock;
 };
 
+void *counter_priv(const struct counter_device *const counter);
+
 int counter_register(struct counter_device *const counter);
 void counter_unregister(struct counter_device *const counter);
 int devm_counter_register(struct device *dev,
-- 
cgit v1.2.3


From c18e2760308e30f007fa24b558b87c39d7e86ff1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de>
Date: Thu, 30 Dec 2021 16:02:50 +0100
Subject: counter: Provide alternative counter registration functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The current implementation gets device lifetime tracking wrong. The
problem is that allocation of struct counter_device is controlled by the
individual drivers but this structure contains a struct device that
might have to live longer than a driver is bound. As a result a command
sequence like:

	{ sleep 5; echo bang; } > /dev/counter0 &
	sleep 1;
	echo 40000000.timer:counter > /sys/bus/platform/drivers/stm32-timer-counter/unbind

can keep a reference to the struct device and unbinding results in
freeing the memory occupied by this device resulting in an oops.

This commit provides two new functions (plus some helpers):
 - counter_alloc() to allocate a struct counter_device that is
   automatically freed once the embedded struct device is released
 - counter_add() to register such a device.

Note that this commit doesn't fix any issues, all drivers have to be
converted to these new functions to correct the lifetime problems.

Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Link: https://lore.kernel.org/r/20211230150300.72196-14-u.kleine-koenig@pengutronix.de
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/counter.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/counter.h b/include/linux/counter.h
index 627f1757f6bb..ed8d5820f0d1 100644
--- a/include/linux/counter.h
+++ b/include/linux/counter.h
@@ -327,14 +327,29 @@ struct counter_device {
 	spinlock_t events_in_lock;
 	struct mutex events_out_lock;
 	struct mutex ops_exist_lock;
+
+	/*
+	 * This can go away once all drivers are converted to
+	 * counter_alloc()/counter_add().
+	 */
+	bool legacy_device;
 };
 
 void *counter_priv(const struct counter_device *const counter);
 
 int counter_register(struct counter_device *const counter);
+
+struct counter_device *counter_alloc(size_t sizeof_priv);
+void counter_put(struct counter_device *const counter);
+int counter_add(struct counter_device *const counter);
+
 void counter_unregister(struct counter_device *const counter);
 int devm_counter_register(struct device *dev,
 			  struct counter_device *const counter);
+struct counter_device *devm_counter_alloc(struct device *dev,
+					  size_t sizeof_priv);
+int devm_counter_add(struct device *dev,
+		     struct counter_device *const counter);
 void counter_push_event(struct counter_device *const counter, const u8 event,
 			const u8 channel);
 
-- 
cgit v1.2.3


From f2ee4759fb700b32a1bd830960fe86bf6bdfd0ab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de>
Date: Thu, 30 Dec 2021 16:03:00 +0100
Subject: counter: remove old and now unused registration API
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Usage of counter_register() yields issues in device lifetime tracking. All
drivers were converted to the new API, so the old one can go away.

Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: William Breathitt Gray <vilhelm.gray@gmail.com>
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Link: https://lore.kernel.org/r/20211230150300.72196-24-u.kleine-koenig@pengutronix.de
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/counter.h | 12 ------------
 1 file changed, 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/counter.h b/include/linux/counter.h
index ed8d5820f0d1..1fe17f5adb09 100644
--- a/include/linux/counter.h
+++ b/include/linux/counter.h
@@ -314,8 +314,6 @@ struct counter_device {
 	struct counter_comp *ext;
 	size_t num_ext;
 
-	void *priv;
-
 	struct device dev;
 	struct cdev chrdev;
 	struct list_head events_list;
@@ -327,25 +325,15 @@ struct counter_device {
 	spinlock_t events_in_lock;
 	struct mutex events_out_lock;
 	struct mutex ops_exist_lock;
-
-	/*
-	 * This can go away once all drivers are converted to
-	 * counter_alloc()/counter_add().
-	 */
-	bool legacy_device;
 };
 
 void *counter_priv(const struct counter_device *const counter);
 
-int counter_register(struct counter_device *const counter);
-
 struct counter_device *counter_alloc(size_t sizeof_priv);
 void counter_put(struct counter_device *const counter);
 int counter_add(struct counter_device *const counter);
 
 void counter_unregister(struct counter_device *const counter);
-int devm_counter_register(struct device *dev,
-			  struct counter_device *const counter);
 struct counter_device *devm_counter_alloc(struct device *dev,
 					  size_t sizeof_priv);
 int devm_counter_add(struct device *dev,
-- 
cgit v1.2.3


From d6c6d0bb2cb3af91c9c1546af7cdf4770d0df443 Mon Sep 17 00:00:00 2001
From: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Date: Wed, 29 Dec 2021 12:36:20 +0100
Subject: net: remove references to CONFIG_IRDA in network header files

Commit d64c2a76123f ("staging: irda: remove the irda network stack and
drivers") removes the config IRDA.

Remove the remaining references to this non-existing config in the network
header files.

Signed-off-by: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Link: https://lore.kernel.org/r/20211229113620.19368-1-lukas.bulwahn@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/atalk.h     | 2 +-
 include/linux/netdevice.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/atalk.h b/include/linux/atalk.h
index f6034ba774be..a55bfc6567d0 100644
--- a/include/linux/atalk.h
+++ b/include/linux/atalk.h
@@ -113,7 +113,7 @@ extern int aarp_proto_init(void);
 /* Inter module exports */
 
 /* Give a device find its atif control structure */
-#if IS_ENABLED(CONFIG_IRDA) || IS_ENABLED(CONFIG_ATALK)
+#if IS_ENABLED(CONFIG_ATALK)
 static inline struct atalk_iface *atalk_find_dev(struct net_device *dev)
 {
 	return dev->atalk_ptr;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2684bdb6defa..6f99c8f51b60 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2098,7 +2098,7 @@ struct net_device {
 #if IS_ENABLED(CONFIG_TIPC)
 	struct tipc_bearer __rcu *tipc_ptr;
 #endif
-#if IS_ENABLED(CONFIG_IRDA) || IS_ENABLED(CONFIG_ATALK)
+#if IS_ENABLED(CONFIG_ATALK)
 	void 			*atalk_ptr;
 #endif
 	struct in_device __rcu	*ip_ptr;
-- 
cgit v1.2.3


From c3fb0e280b4cdbf382f59eb6b276e4c6047bc219 Mon Sep 17 00:00:00 2001
From: Yevgeny Kliteynik <kliteyn@nvidia.com>
Date: Thu, 18 Nov 2021 02:32:37 +0200
Subject: net/mlx5: DR, Fix lower case macro prefix "mlx5_" to "MLX5_"

Macros prefix should be capital letters - fix the prefix in
mlx5_FLEX_PARSER_MPLS_OVER_UDP_ENABLED.

Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index e9db12aae8f9..18b816b41545 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1291,7 +1291,7 @@ enum {
 enum {
 	MLX5_FLEX_PARSER_GENEVE_ENABLED		= 1 << 3,
 	MLX5_FLEX_PARSER_MPLS_OVER_GRE_ENABLED	= 1 << 4,
-	mlx5_FLEX_PARSER_MPLS_OVER_UDP_ENABLED	= 1 << 5,
+	MLX5_FLEX_PARSER_MPLS_OVER_UDP_ENABLED	= 1 << 5,
 	MLX5_FLEX_PARSER_VXLAN_GPE_ENABLED	= 1 << 7,
 	MLX5_FLEX_PARSER_ICMP_V4_ENABLED	= 1 << 8,
 	MLX5_FLEX_PARSER_ICMP_V6_ENABLED	= 1 << 9,
-- 
cgit v1.2.3


From 0f2a6c3b9219bdf497750258cd2ad513f0056b42 Mon Sep 17 00:00:00 2001
From: Muhammad Sammar <muhammads@nvidia.com>
Date: Sun, 5 Sep 2021 15:16:21 +0300
Subject: net/mlx5: Add misc5 flow table match parameters

Add support for misc5 match parameter as per HW spec, this will allow
matching on tunnel_header fields.

Signed-off-by: Muhammad Sammar <muhammads@nvidia.com>
Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
---
 include/linux/mlx5/device.h   |  1 +
 include/linux/mlx5/mlx5_ifc.h | 25 ++++++++++++++++++++++++-
 2 files changed, 25 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 9c25edfd59a6..604b85dd770a 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1117,6 +1117,7 @@ enum {
 	MLX5_MATCH_MISC_PARAMETERS_2	= 1 << 3,
 	MLX5_MATCH_MISC_PARAMETERS_3	= 1 << 4,
 	MLX5_MATCH_MISC_PARAMETERS_4	= 1 << 5,
+	MLX5_MATCH_MISC_PARAMETERS_5	= 1 << 6,
 };
 
 enum {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 18b816b41545..c74c5e147cb9 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -670,6 +670,26 @@ struct mlx5_ifc_fte_match_set_misc4_bits {
 	u8         reserved_at_100[0x100];
 };
 
+struct mlx5_ifc_fte_match_set_misc5_bits {
+	u8         macsec_tag_0[0x20];
+
+	u8         macsec_tag_1[0x20];
+
+	u8         macsec_tag_2[0x20];
+
+	u8         macsec_tag_3[0x20];
+
+	u8         tunnel_header_0[0x20];
+
+	u8         tunnel_header_1[0x20];
+
+	u8         tunnel_header_2[0x20];
+
+	u8         tunnel_header_3[0x20];
+
+	u8         reserved_at_100[0x100];
+};
+
 struct mlx5_ifc_cmd_pas_bits {
 	u8         pa_h[0x20];
 
@@ -1839,7 +1859,9 @@ struct mlx5_ifc_fte_match_param_bits {
 
 	struct mlx5_ifc_fte_match_set_misc4_bits misc_parameters_4;
 
-	u8         reserved_at_c00[0x400];
+	struct mlx5_ifc_fte_match_set_misc5_bits misc_parameters_5;
+
+	u8         reserved_at_e00[0x200];
 };
 
 enum {
@@ -5977,6 +5999,7 @@ enum {
 	MLX5_QUERY_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS_2 = 0x3,
 	MLX5_QUERY_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS_3 = 0x4,
 	MLX5_QUERY_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS_4 = 0x5,
+	MLX5_QUERY_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS_5 = 0x6,
 };
 
 struct mlx5_ifc_query_flow_group_out_bits {
-- 
cgit v1.2.3


From f59464e257bdbd4df6df9a4505d7858a0baf6cf7 Mon Sep 17 00:00:00 2001
From: Yevgeny Kliteynik <kliteyn@nvidia.com>
Date: Mon, 8 Nov 2021 02:42:50 +0200
Subject: net/mlx5: DR, Add support for matching on geneve_tlv_option_0_exist
 field

Match on geneve_tlv_option_0_exist field on devices that support STEv1.

Signed-off-by: Muhammad Sammar <muhammads@nvidia.com>
Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index c74c5e147cb9..deaa0f71213f 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -372,7 +372,8 @@ struct mlx5_ifc_flow_table_fields_supported_bits {
 	u8         reserved_at_37[0x9];
 
 	u8         geneve_tlv_option_0_data[0x1];
-	u8         reserved_at_41[0x4];
+	u8         geneve_tlv_option_0_exist[0x1];
+	u8         reserved_at_42[0x3];
 	u8         outer_first_mpls_over_udp[0x4];
 	u8         outer_first_mpls_over_gre[0x4];
 	u8         inner_first_mpls[0x4];
@@ -551,7 +552,8 @@ struct mlx5_ifc_fte_match_set_misc_bits {
 	u8         bth_opcode[0x8];
 
 	u8         geneve_vni[0x18];
-	u8         reserved_at_d8[0x7];
+	u8         reserved_at_d8[0x6];
+	u8         geneve_tlv_option_0_exist[0x1];
 	u8         geneve_oam[0x1];
 
 	u8         reserved_at_e0[0xc];
-- 
cgit v1.2.3


From 4ff725e1d4adfd313bc9767523fc8d6e90d50f9c Mon Sep 17 00:00:00 2001
From: Yevgeny Kliteynik <kliteyn@nvidia.com>
Date: Tue, 23 Nov 2021 02:11:12 +0200
Subject: net/mlx5: DR, Ignore modify TTL if device doesn't support it

When modifying TTL, packet's csum has to be recalculated.
Due to HW issue in ConnectX-5, csum recalculation for modify TTL
is supported through a work-around that is specifically enabled
by configuration.
If the work-around isn't enabled, ignore the modify TTL action
rather than adding an unsupported action.

Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index deaa0f71213f..598ac3bcc901 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -833,7 +833,7 @@ struct mlx5_ifc_flow_table_eswitch_cap_bits {
 	u8      fdb_to_vport_reg_c_id[0x8];
 	u8      reserved_at_8[0xd];
 	u8      fdb_modify_header_fwd_to_table[0x1];
-	u8      reserved_at_16[0x1];
+	u8      fdb_ipv4_ttl_modify[0x1];
 	u8      flow_source[0x1];
 	u8      reserved_at_18[0x2];
 	u8      multi_fdb_encap[0x1];
-- 
cgit v1.2.3


From 99a507a8ea28542ec196e2dd80096708e2482735 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Fri, 31 Dec 2021 13:42:30 +0100
Subject: Revert "serdev: BREAK/FRAME/PARITY/OVERRUN notification prototype V2"

This reverts commit d8e9a406a931f687945703a4bac45042eb81ce92.

It needs some future changes as pointed out by Johan and is not ready to
be merged just yet.

Reported-by: Johan Hovold <johan@kernel.org>
Cc: Magnus Damm <damm+renesas@opensource.se>
Link: https://lore.kernel.org/r/Yc7oZ/1tu95Z4wPS@hovoldconsulting.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/serdev.h | 22 ----------------------
 1 file changed, 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/serdev.h b/include/linux/serdev.h
index 0d0b22fc7e37..3368c261ab62 100644
--- a/include/linux/serdev.h
+++ b/include/linux/serdev.h
@@ -19,15 +19,12 @@ struct serdev_device;
 
 /**
  * struct serdev_device_ops - Callback operations for a serdev device
- * @error:		Function called with errors received from device;
- *			may sleep.
  * @receive_buf:	Function called with data received from device;
  *			returns number of bytes accepted; may sleep.
  * @write_wakeup:	Function called when ready to transmit more data; must
  *			not sleep.
  */
 struct serdev_device_ops {
-	void (*error)(struct serdev_device *, unsigned long);
 	int (*receive_buf)(struct serdev_device *, const unsigned char *, size_t);
 	void (*write_wakeup)(struct serdev_device *);
 };
@@ -79,11 +76,6 @@ enum serdev_parity {
 	SERDEV_PARITY_ODD,
 };
 
-#define SERDEV_ERROR_BREAK 0
-#define SERDEV_ERROR_FRAME 1
-#define SERDEV_ERROR_PARITY 2
-#define SERDEV_ERROR_OVERRUN 3
-
 /*
  * serdev controller structures
  */
@@ -93,7 +85,6 @@ struct serdev_controller_ops {
 	int (*write_room)(struct serdev_controller *);
 	int (*open)(struct serdev_controller *);
 	void (*close)(struct serdev_controller *);
-	void (*set_error_mask)(struct serdev_controller *, unsigned long);
 	void (*set_flow_control)(struct serdev_controller *, bool);
 	int (*set_parity)(struct serdev_controller *, enum serdev_parity);
 	unsigned int (*set_baudrate)(struct serdev_controller *, unsigned int);
@@ -199,24 +190,12 @@ static inline int serdev_controller_receive_buf(struct serdev_controller *ctrl,
 	return serdev->ops->receive_buf(serdev, data, count);
 }
 
-static inline void serdev_controller_error(struct serdev_controller *ctrl,
-					   unsigned long errors)
-{
-	struct serdev_device *serdev = ctrl->serdev;
-
-	if (!serdev || !serdev->ops->error)
-		return;
-
-	serdev->ops->error(serdev, errors);
-}
-
 #if IS_ENABLED(CONFIG_SERIAL_DEV_BUS)
 
 int serdev_device_open(struct serdev_device *);
 void serdev_device_close(struct serdev_device *);
 int devm_serdev_device_open(struct device *, struct serdev_device *);
 unsigned int serdev_device_set_baudrate(struct serdev_device *, unsigned int);
-void serdev_device_set_error_mask(struct serdev_device *, unsigned long);
 void serdev_device_set_flow_control(struct serdev_device *, bool);
 int serdev_device_write_buf(struct serdev_device *, const unsigned char *, size_t);
 void serdev_device_wait_until_sent(struct serdev_device *, long);
@@ -259,7 +238,6 @@ static inline unsigned int serdev_device_set_baudrate(struct serdev_device *sdev
 {
 	return 0;
 }
-static inline void serdev_device_set_error_mask(struct serdev_device *sdev, unsigned long mask) {}
 static inline void serdev_device_set_flow_control(struct serdev_device *sdev, bool enable) {}
 static inline int serdev_device_write_buf(struct serdev_device *serdev,
 					  const unsigned char *buf,
-- 
cgit v1.2.3


From 9a45ac2320d0a6ae01880a30d4b86025fce4061b Mon Sep 17 00:00:00 2001
From: Alex Deucher <alexander.deucher@amd.com>
Date: Wed, 22 Dec 2021 22:41:18 -0500
Subject: fbdev: fbmem: add a helper to determine if an aperture is used by a
 fw fb

Add a function for drivers to check if the a firmware initialized
fb is corresponds to their aperture.  This allows drivers to check if the
device corresponds to what the firmware set up as the display device.

Bug: https://bugzilla.kernel.org/show_bug.cgi?id=215203
Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1840
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 include/linux/fb.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/fb.h b/include/linux/fb.h
index 6f3db99ab990..3da95842b207 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -610,6 +610,7 @@ extern int remove_conflicting_pci_framebuffers(struct pci_dev *pdev,
 					       const char *name);
 extern int remove_conflicting_framebuffers(struct apertures_struct *a,
 					   const char *name, bool primary);
+extern bool is_firmware_framebuffer(struct apertures_struct *a);
 extern int fb_prepare_logo(struct fb_info *fb_info, int rotate);
 extern int fb_show_logo(struct fb_info *fb_info, int rotate);
 extern char* fb_get_buffer_offset(struct fb_info *info, struct fb_pixmap *buf, u32 size);
-- 
cgit v1.2.3


From 1b4e3f26f9f7553b260b8aed43967500961448a6 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 2 Dec 2021 15:06:14 +0000
Subject: mm: vmscan: Reduce throttling due to a failure to make progress

Mike Galbraith, Alexey Avramov and Darrick Wong all reported similar
problems due to reclaim throttling for excessive lengths of time.  In
Alexey's case, a memory hog that should go OOM quickly stalls for
several minutes before stalling.  In Mike and Darrick's cases, a small
memcg environment stalled excessively even though the system had enough
memory overall.

Commit 69392a403f49 ("mm/vmscan: throttle reclaim when no progress is
being made") introduced the problem although commit a19594ca4a8b
("mm/vmscan: increase the timeout if page reclaim is not making
progress") made it worse.  Systems at or near an OOM state that cannot
be recovered must reach OOM quickly and memcg should kill tasks if a
memcg is near OOM.

To address this, only stall for the first zone in the zonelist, reduce
the timeout to 1 tick for VMSCAN_THROTTLE_NOPROGRESS and only stall if
the scan control nr_reclaimed is 0, kswapd is still active and there
were excessive pages pending for writeback.  If kswapd has stopped
reclaiming due to excessive failures, do not stall at all so that OOM
triggers relatively quickly.  Similarly, if an LRU is simply congested,
only lightly throttle similar to NOPROGRESS.

Alexey's original case was the most straight forward

	for i in {1..3}; do tail /dev/zero; done

On vanilla 5.16-rc1, this test stalled heavily, after the patch the test
completes in a few seconds similar to 5.15.

Alexey's second test case added watching a youtube video while tail runs
10 times.  On 5.15, playback only jitters slightly, 5.16-rc1 stalls a
lot with lots of frames missing and numerous audio glitches.  With this
patch applies, the video plays similarly to 5.15.

[lkp@intel.com: Fix W=1 build warning]

Link: https://lore.kernel.org/r/99e779783d6c7fce96448a3402061b9dc1b3b602.camel@gmx.de
Link: https://lore.kernel.org/r/20211124011954.7cab9bb4@mail.inbox.lv
Link: https://lore.kernel.org/r/20211022144651.19914-1-mgorman@techsingularity.net
Link: https://lore.kernel.org/r/20211202150614.22440-1-mgorman@techsingularity.net
Link: https://linux-regtracking.leemhuis.info/regzbot/regression/20211124011954.7cab9bb4@mail.inbox.lv/
Reported-and-tested-by: Alexey Avramov <hakavlad@inbox.lv>
Reported-and-tested-by: Mike Galbraith <efault@gmx.de>
Reported-and-tested-by: Darrick J. Wong <djwong@kernel.org>
Reported-by: kernel test robot <lkp@intel.com>
Acked-by: Hugh Dickins <hughd@google.com>
Tracked-by: Thorsten Leemhuis <regressions@leemhuis.info>
Fixes: 69392a403f49 ("mm/vmscan: throttle reclaim when no progress is being made")
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 58e744b78c2c..936dc0b6c226 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -277,6 +277,7 @@ enum vmscan_throttle_state {
 	VMSCAN_THROTTLE_WRITEBACK,
 	VMSCAN_THROTTLE_ISOLATED,
 	VMSCAN_THROTTLE_NOPROGRESS,
+	VMSCAN_THROTTLE_CONGESTED,
 	NR_VMSCAN_THROTTLE,
 };
 
-- 
cgit v1.2.3


From e7026f15564fbe0c8b091f218203111f77b84eda Mon Sep 17 00:00:00 2001
From: Colin Foster <colin.foster@in-advantage.com>
Date: Tue, 28 Dec 2021 21:03:06 -0800
Subject: net: phy: lynx: refactor Lynx PCS module to use generic phylink_pcs

Remove references to lynx_pcs structures so drivers like the Felix DSA
can reference alternate PCS drivers.

Signed-off-by: Colin Foster <colin.foster@in-advantage.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/pcs-lynx.h | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pcs-lynx.h b/include/linux/pcs-lynx.h
index a6440d6ebe95..5712cc2ce775 100644
--- a/include/linux/pcs-lynx.h
+++ b/include/linux/pcs-lynx.h
@@ -9,13 +9,10 @@
 #include <linux/mdio.h>
 #include <linux/phylink.h>
 
-struct lynx_pcs {
-	struct phylink_pcs pcs;
-	struct mdio_device *mdio;
-};
+struct mdio_device *lynx_get_mdio_device(struct phylink_pcs *pcs);
 
-struct lynx_pcs *lynx_pcs_create(struct mdio_device *mdio);
+struct phylink_pcs *lynx_pcs_create(struct mdio_device *mdio);
 
-void lynx_pcs_destroy(struct lynx_pcs *pcs);
+void lynx_pcs_destroy(struct phylink_pcs *pcs);
 
 #endif /* __LINUX_PCS_LYNX_H */
-- 
cgit v1.2.3


From ece014141cd4b49f2d763f28b19e417b84460560 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 3 May 2021 07:29:47 -0400
Subject: mm/doc: Add documentation for folio_test_uptodate

Move the PG_uptodate documentation to be documentation for
folio_test_uptodate() and expand on it a little.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
---
 include/linux/page-flags.h | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index b5f14d581113..b3d353d537e2 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -68,9 +68,6 @@
  * might lose their PG_swapbacked flag when they simply can be dropped (e.g. as
  * a result of MADV_FREE).
  *
- * PG_uptodate tells whether the page's contents is valid.  When a read
- * completes, the page becomes uptodate, unless a disk I/O error happened.
- *
  * PG_referenced, PG_reclaim are used for page reclaim for anonymous and
  * file-backed pagecache (see mm/vmscan.c).
  *
@@ -615,6 +612,16 @@ TESTPAGEFLAG_FALSE(Ksm, ksm)
 
 u64 stable_page_flags(struct page *page);
 
+/**
+ * folio_test_uptodate - Is this folio up to date?
+ * @folio: The folio.
+ *
+ * The uptodate flag is set on a folio when every byte in the folio is
+ * at least as new as the corresponding bytes on storage.  Anonymous
+ * and CoW folios are always uptodate.  If the folio is not uptodate,
+ * some of the bytes in it may be; see the is_partially_uptodate()
+ * address_space operation.
+ */
 static inline bool folio_test_uptodate(struct folio *folio)
 {
 	bool ret = test_bit(PG_uptodate, folio_flags(folio, 0));
-- 
cgit v1.2.3


From 10331795fb7991a39ebd0330fdb074cbd81fef48 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 6 Dec 2021 15:24:51 -0500
Subject: pagevec: Add folio_batch

The folio_batch is the same as the pagevec, except that it is typed
to contain folios and not pages.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
---
 include/linux/pagevec.h | 69 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 7f3f19065a9f..c3fa616d7ae7 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -15,8 +15,10 @@
 #define PAGEVEC_SIZE	15
 
 struct page;
+struct folio;
 struct address_space;
 
+/* Layout must match folio_batch */
 struct pagevec {
 	unsigned char nr;
 	bool percpu_pvec_drained;
@@ -81,4 +83,71 @@ static inline void pagevec_release(struct pagevec *pvec)
 		__pagevec_release(pvec);
 }
 
+/**
+ * struct folio_batch - A collection of folios.
+ *
+ * The folio_batch is used to amortise the cost of retrieving and
+ * operating on a set of folios.  The order of folios in the batch may be
+ * significant (eg delete_from_page_cache_batch()).  Some users of the
+ * folio_batch store "exceptional" entries in it which can be removed
+ * by calling folio_batch_remove_exceptionals().
+ */
+struct folio_batch {
+	unsigned char nr;
+	bool percpu_pvec_drained;
+	struct folio *folios[PAGEVEC_SIZE];
+};
+
+/* Layout must match pagevec */
+static_assert(sizeof(struct pagevec) == sizeof(struct folio_batch));
+static_assert(offsetof(struct pagevec, pages) ==
+		offsetof(struct folio_batch, folios));
+
+/**
+ * folio_batch_init() - Initialise a batch of folios
+ * @fbatch: The folio batch.
+ *
+ * A freshly initialised folio_batch contains zero folios.
+ */
+static inline void folio_batch_init(struct folio_batch *fbatch)
+{
+	fbatch->nr = 0;
+}
+
+static inline unsigned int folio_batch_count(struct folio_batch *fbatch)
+{
+	return fbatch->nr;
+}
+
+static inline unsigned int fbatch_space(struct folio_batch *fbatch)
+{
+	return PAGEVEC_SIZE - fbatch->nr;
+}
+
+/**
+ * folio_batch_add() - Add a folio to a batch.
+ * @fbatch: The folio batch.
+ * @folio: The folio to add.
+ *
+ * The folio is added to the end of the batch.
+ * The batch must have previously been initialised using folio_batch_init().
+ *
+ * Return: The number of slots still available.
+ */
+static inline unsigned folio_batch_add(struct folio_batch *fbatch,
+		struct folio *folio)
+{
+	fbatch->folios[fbatch->nr++] = folio;
+	return fbatch_space(fbatch);
+}
+
+static inline void folio_batch_release(struct folio_batch *fbatch)
+{
+	pagevec_release((struct pagevec *)fbatch);
+}
+
+static inline void folio_batch_remove_exceptionals(struct folio_batch *fbatch)
+{
+	pagevec_remove_exceptionals((struct pagevec *)fbatch);
+}
 #endif /* _LINUX_PAGEVEC_H */
-- 
cgit v1.2.3


From e66d70c034dbdfe1a48863f0865ac86aaf2fef1a Mon Sep 17 00:00:00 2001
From: Vinod Koul <vkoul@kernel.org>
Date: Mon, 13 Dec 2021 10:51:41 +0530
Subject: dmaengine: xilinx_dpdma: use correct SDPX tag for header file

Commit 188c310bdd5d ("dmaengine: xilinx_dpdma: stop using slave_id
field") add the header file with incorrect format for SPDX tag, fix that

WARNING: Improper SPDX comment style for 'include/linux/dma/xilinx_dpdma.h', please use '/*' instead
#1: FILE: include/linux/dma/xilinx_dpdma.h:1:
+// SPDX-License-Identifier: GPL-2.0

WARNING: Missing or malformed SPDX-License-Identifier tag in line 1
#1: FILE: include/linux/dma/xilinx_dpdma.h:1:
+// SPDX-License-Identifier: GPL-2.0

Fixes: 188c310bdd5d ("dmaengine: xilinx_dpdma: stop using slave_id field")
Signed-off-by: Vinod Koul <vkoul@kernel.org>
Link: https://lore.kernel.org/r/20211213052141.850807-1-vkoul@kernel.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/dma/xilinx_dpdma.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dma/xilinx_dpdma.h b/include/linux/dma/xilinx_dpdma.h
index 83a1377f03f8..02a4adf8921b 100644
--- a/include/linux/dma/xilinx_dpdma.h
+++ b/include/linux/dma/xilinx_dpdma.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
 #ifndef __LINUX_DMA_XILINX_DPDMA_H
 #define __LINUX_DMA_XILINX_DPDMA_H
 
-- 
cgit v1.2.3


From 25fd330370ac40653671f323acc7fb6db27ef6fe Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Wed, 15 Dec 2021 02:01:18 +0100
Subject: power: supply_core: Pass pointer to battery info

The function to retrieve battery info (from the device tree) assumes
we have a static info struct that gets populated by calling into
power_supply_get_battery_info().

This is awkward since I want to support tables of static battery
info by just assigning a pointer to all info based on e.g. a
compatible value in the device tree.

We also have a mixture of static and dynamically allocated
variables here.

Bite the bullet and let power_supply_get_battery_info() allocate
also the memory used for the very top level
struct power_supply_battery_info container. Pass pointers
around and lifecycle this with the psy device just like the
stuff we allocate inside it.

Change all current users over.

As part of the change, initializers need to be added to some
previously uninitialized fields in struct
power_supply_battery_info.

Reviewed-By: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 include/linux/power_supply.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index f6e94eae4f28..86b4d5c4dab9 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -575,7 +575,7 @@ devm_power_supply_get_by_phandle(struct device *dev, const char *property)
 #endif /* CONFIG_OF */
 
 extern int power_supply_get_battery_info(struct power_supply *psy,
-					 struct power_supply_battery_info *info);
+					 struct power_supply_battery_info **info_out);
 extern void power_supply_put_battery_info(struct power_supply *psy,
 					  struct power_supply_battery_info *info);
 extern int power_supply_ocv2cap_simple(struct power_supply_battery_ocv_table *table,
-- 
cgit v1.2.3


From d5dbcca70182501bed99de85c224cef04c38ed92 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Mon, 3 Jan 2022 17:24:08 +0100
Subject: pktcdvd: convert to use attribute groups

There is no need to create kobject children of the pktcdvd device just
to display a subdirectory name.  Instead, use a named attribute group
which removes the extra kobjects and also fixes the userspace race where
the device is created yet tools like libudev can not see the attributes
as they think the subdirectories are some other sort of device.

Cc: linux-block@vger.kernel.org
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/20220103162408.742003-1-gregkh@linuxfoundation.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/pktcdvd.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pktcdvd.h b/include/linux/pktcdvd.h
index c391e694aa26..f9c5ac80d59b 100644
--- a/include/linux/pktcdvd.h
+++ b/include/linux/pktcdvd.h
@@ -152,14 +152,6 @@ struct packet_stacked_data
 };
 #define PSD_POOL_SIZE		64
 
-struct pktcdvd_kobj
-{
-	struct kobject		kobj;
-	struct pktcdvd_device	*pd;
-};
-#define to_pktcdvdkobj(_k) \
-  ((struct pktcdvd_kobj*)container_of(_k,struct pktcdvd_kobj,kobj))
-
 struct pktcdvd_device
 {
 	struct block_device	*bdev;		/* dev attached */
@@ -197,8 +189,6 @@ struct pktcdvd_device
 	int			write_congestion_on;
 
 	struct device		*dev;		/* sysfs pktcdvd[0-7] dev */
-	struct pktcdvd_kobj	*kobj_stat;	/* sysfs pktcdvd[0-7]/stat/     */
-	struct pktcdvd_kobj	*kobj_wqueue;	/* sysfs pktcdvd[0-7]/write_queue/ */
 
 	struct dentry		*dfs_d_root;	/* debugfs: devname directory */
 	struct dentry		*dfs_f_info;	/* debugfs: info file */
-- 
cgit v1.2.3


From 6c952a0dc9c3ced98c4c8aa7cd11c25c59157f1f Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Tue, 21 Dec 2021 08:20:26 +0100
Subject: ata: libata: Add ata_port_classify() helper

Add an ata_port_classify() helper to print out the results from
the device classification and remove the debugging statements
from ata_dev_classify().

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
---
 include/linux/libata.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 2a8404b26083..235fdbeb19ea 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1160,6 +1160,8 @@ extern enum ata_completion_errors ata_noop_qc_prep(struct ata_queued_cmd *qc);
 extern void ata_sg_init(struct ata_queued_cmd *qc, struct scatterlist *sg,
 		 unsigned int n_elem);
 extern unsigned int ata_dev_classify(const struct ata_taskfile *tf);
+extern unsigned int ata_port_classify(struct ata_port *ap,
+				      const struct ata_taskfile *tf);
 extern void ata_dev_disable(struct ata_device *adev);
 extern void ata_id_string(const u16 *id, unsigned char *s,
 			  unsigned int ofs, unsigned int len);
-- 
cgit v1.2.3


From e41294408c56c68ea0f269d757527bf33b39118a Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Mon, 3 Jan 2022 18:11:31 +0100
Subject: icmp: ICMPV6: Examine invoking packet for Segment Route Headers.

RFC8754 says:

ICMP error packets generated within the SR domain are sent to source
nodes within the SR domain.  The invoking packet in the ICMP error
message may contain an SRH.  Since the destination address of a packet
with an SRH changes as each segment is processed, it may not be the
destination used by the socket or application that generated the
invoking packet.

For the source of an invoking packet to process the ICMP error
message, the ultimate destination address of the IPv6 header may be
required.  The following logic is used to determine the destination
address for use by protocol-error handlers.

*  Walk all extension headers of the invoking IPv6 packet to the
   routing extension header preceding the upper-layer header.

   -  If routing header is type 4 Segment Routing Header (SRH)

      o  The SID at Segment List[0] may be used as the destination
         address of the invoking packet.

Mangle the skb so the network header points to the invoking packet
inside the ICMP packet. The seg6 helpers can then be used on the skb
to find any segment routing headers. If found, mark this fact in the
IPv6 control block of the skb, and store the offset into the packet of
the SRH. Then restore the skb back to its old state.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: David Ahern <dsahern@kernel.org>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 20c1f968da7c..a59d25f19385 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -133,6 +133,7 @@ struct inet6_skb_parm {
 	__u16			dsthao;
 #endif
 	__u16			frag_max_size;
+	__u16			srhoff;
 
 #define IP6SKB_XFRM_TRANSFORMED	1
 #define IP6SKB_FORWARDED	2
@@ -142,6 +143,7 @@ struct inet6_skb_parm {
 #define IP6SKB_HOPBYHOP        32
 #define IP6SKB_L3SLAVE         64
 #define IP6SKB_JUMBOGRAM      128
+#define IP6SKB_SEG6	      256
 };
 
 #if defined(CONFIG_NET_L3_MASTER_DEV)
-- 
cgit v1.2.3


From f083266487690124481eac0869da850406fb3ed3 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sun, 29 Aug 2021 09:18:53 +0200
Subject: headers/uninline: Uninline single-use function:
 kobject_has_children()

This was the only usage of <linux/kref_api.h> in <linux/kobject_api.h>,
so we'll able to decouple the two after this change.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/kobject.h | 17 -----------------
 1 file changed, 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kobject.h b/include/linux/kobject.h
index ad90b49824dc..c7b47399b36a 100644
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -117,23 +117,6 @@ extern void kobject_get_ownership(struct kobject *kobj,
 				  kuid_t *uid, kgid_t *gid);
 extern char *kobject_get_path(struct kobject *kobj, gfp_t flag);
 
-/**
- * kobject_has_children - Returns whether a kobject has children.
- * @kobj: the object to test
- *
- * This will return whether a kobject has other kobjects as children.
- *
- * It does NOT account for the presence of attribute files, only sub
- * directories. It also assumes there is no concurrent addition or
- * removal of such children, and thus relies on external locking.
- */
-static inline bool kobject_has_children(struct kobject *kobj)
-{
-	WARN_ON_ONCE(kref_read(&kobj->kref) == 0);
-
-	return kobj->sd && kobj->sd->dir.subdirs;
-}
-
 struct kobj_type {
 	void (*release)(struct kobject *kobj);
 	const struct sysfs_ops *sysfs_ops;
-- 
cgit v1.2.3


From d9c19d32d86fa54934b632c4314beb067bf98378 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 18 Oct 2021 10:39:06 -0400
Subject: iov_iter: Add copy_folio_to_iter()

This wrapper around copy_page_to_iter() works because copy_page_to_iter()
handles compound pages correctly.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
---
 include/linux/uio.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 6350354f97e9..43321dbebba8 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -7,6 +7,7 @@
 
 #include <linux/kernel.h>
 #include <linux/thread_info.h>
+#include <linux/mm_types.h>
 #include <uapi/linux/uio.h>
 
 struct page;
@@ -146,6 +147,12 @@ size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i);
 size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i);
 size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i);
 
+static inline size_t copy_folio_to_iter(struct folio *folio, size_t offset,
+		size_t bytes, struct iov_iter *i)
+{
+	return copy_page_to_iter(&folio->page, offset, bytes, i);
+}
+
 static __always_inline __must_check
 size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 {
-- 
cgit v1.2.3


From 5bf34d7c7ffe773c3b3c1b6ebf39e0f34a2436ec Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Sun, 28 Nov 2021 14:24:43 -0500
Subject: mm: Add folio_test_pmd_mappable()

Add a predicate to determine if the folio might be mapped by a PMD entry.
If CONFIG_TRANSPARENT_HUGEPAGE is disabled, we know it can't be, even
if it's large enough.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
---
 include/linux/huge_mm.h | 14 ++++++++++++++
 include/linux/mm.h      | 42 +++++++++++++++++++++---------------------
 2 files changed, 35 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index f280f33ff223..e4c18ba8d3bf 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -274,6 +274,15 @@ static inline int thp_nr_pages(struct page *page)
 	return 1;
 }
 
+/**
+ * folio_test_pmd_mappable - Can we map this folio with a PMD?
+ * @folio: The folio to test
+ */
+static inline bool folio_test_pmd_mappable(struct folio *folio)
+{
+	return folio_order(folio) >= HPAGE_PMD_ORDER;
+}
+
 struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
 		pmd_t *pmd, int flags, struct dev_pagemap **pgmap);
 struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
@@ -339,6 +348,11 @@ static inline int thp_nr_pages(struct page *page)
 	return 1;
 }
 
+static inline bool folio_test_pmd_mappable(struct folio *folio)
+{
+	return false;
+}
+
 static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
 {
 	return false;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a7e4a9e7d807..72ca04f16711 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -714,6 +714,27 @@ int vma_is_stack_for_current(struct vm_area_struct *vma);
 struct mmu_gather;
 struct inode;
 
+static inline unsigned int compound_order(struct page *page)
+{
+	if (!PageHead(page))
+		return 0;
+	return page[1].compound_order;
+}
+
+/**
+ * folio_order - The allocation order of a folio.
+ * @folio: The folio.
+ *
+ * A folio is composed of 2^order pages.  See get_order() for the definition
+ * of order.
+ *
+ * Return: The order of the folio.
+ */
+static inline unsigned int folio_order(struct folio *folio)
+{
+	return compound_order(&folio->page);
+}
+
 #include <linux/huge_mm.h>
 
 /*
@@ -906,27 +927,6 @@ static inline void destroy_compound_page(struct page *page)
 	compound_page_dtors[page[1].compound_dtor](page);
 }
 
-static inline unsigned int compound_order(struct page *page)
-{
-	if (!PageHead(page))
-		return 0;
-	return page[1].compound_order;
-}
-
-/**
- * folio_order - The allocation order of a folio.
- * @folio: The folio.
- *
- * A folio is composed of 2^order pages.  See get_order() for the definition
- * of order.
- *
- * Return: The order of the folio.
- */
-static inline unsigned int folio_order(struct folio *folio)
-{
-	return compound_order(&folio->page);
-}
-
 static inline bool hpage_pincount_available(struct page *page)
 {
 	/*
-- 
cgit v1.2.3


From 9f2b04a25a41b1f41b3cead4f56854a4192ec5b0 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 16 Aug 2021 23:36:31 -0400
Subject: filemap: Add folio_put_wait_locked()

Convert all three callers of put_and_wait_on_page_locked() to
folio_put_wait_locked().  This shrinks the kernel overall by 19 bytes.
filemap_update_page() shrinks by 19 bytes while __migration_entry_wait()
is unchanged.  folio_put_wait_locked() is 14 bytes smaller than
put_and_wait_on_page_locked(), but pmd_migration_entry_wait() grows by
14 bytes.  It removes the assumption from pmd_migration_entry_wait()
that pages cannot be larger than a PMD (which is true today, but
may be interesting to explore in the future).

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
---
 include/linux/pagemap.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 605246452305..841f7ba62d7d 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -868,7 +868,7 @@ static inline int wait_on_page_locked_killable(struct page *page)
 	return folio_wait_locked_killable(page_folio(page));
 }
 
-int put_and_wait_on_page_locked(struct page *page, int state);
+int folio_put_wait_locked(struct folio *folio, int state);
 void wait_on_page_writeback(struct page *page);
 void folio_wait_writeback(struct folio *folio);
 int folio_wait_writeback_killable(struct folio *folio);
-- 
cgit v1.2.3


From 621db4880d305bc37b343b1671e03b7eb5d61389 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Sat, 8 May 2021 20:04:05 -0400
Subject: filemap: Add filemap_unaccount_folio()

Replace unaccount_page_cache_page() with filemap_unaccount_folio().
The bug handling path could be a bit more robust (eg taking into account
the mapcounts of tail pages), but it's really never supposed to happen.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
---
 include/linux/pagemap.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 841f7ba62d7d..077b6f378666 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -884,11 +884,6 @@ static inline void __set_page_dirty(struct page *page,
 }
 void folio_account_cleaned(struct folio *folio, struct address_space *mapping,
 			  struct bdi_writeback *wb);
-static inline void account_page_cleaned(struct page *page,
-		struct address_space *mapping, struct bdi_writeback *wb)
-{
-	return folio_account_cleaned(page_folio(page), mapping, wb);
-}
 void __folio_cancel_dirty(struct folio *folio);
 static inline void folio_cancel_dirty(struct folio *folio)
 {
-- 
cgit v1.2.3


From 452e9e6992fe058a650c81d01a9982e3faf10278 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Sun, 9 May 2021 09:33:42 -0400
Subject: filemap: Add filemap_remove_folio and __filemap_remove_folio

Reimplement __delete_from_page_cache() as a wrapper around
__filemap_remove_folio() and delete_from_page_cache() as a wrapper
around filemap_remove_folio().  Remove the EXPORT_SYMBOL as
delete_from_page_cache() was not used by any in-tree modules.
Convert page_cache_free_page() into filemap_free_folio().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
---
 include/linux/pagemap.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 077b6f378666..3f26b191ede3 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -930,8 +930,13 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 		pgoff_t index, gfp_t gfp);
 int filemap_add_folio(struct address_space *mapping, struct folio *folio,
 		pgoff_t index, gfp_t gfp);
-extern void delete_from_page_cache(struct page *page);
-extern void __delete_from_page_cache(struct page *page, void *shadow);
+void filemap_remove_folio(struct folio *folio);
+void delete_from_page_cache(struct page *page);
+void __filemap_remove_folio(struct folio *folio, void *shadow);
+static inline void __delete_from_page_cache(struct page *page, void *shadow)
+{
+	__filemap_remove_folio(page_folio(page), shadow);
+}
 void replace_page_cache_page(struct page *old, struct page *new);
 void delete_from_page_cache_batch(struct address_space *mapping,
 				  struct pagevec *pvec);
-- 
cgit v1.2.3


From bb2e98b613a3c76c904dfa82eb4b86773817598b Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Sun, 28 Nov 2021 16:14:50 -0500
Subject: filemap: Remove thp_contains()

This function is now unused, so delete it.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
---
 include/linux/pagemap.h | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 3f26b191ede3..8c2cad7f0c36 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -512,15 +512,6 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping,
 			mapping_gfp_mask(mapping));
 }
 
-/* Does this page contain this index? */
-static inline bool thp_contains(struct page *head, pgoff_t index)
-{
-	/* HugeTLBfs indexes the page cache in units of hpage_size */
-	if (PageHuge(head))
-		return head->index == index;
-	return page_index(head) == (index & ~(thp_nr_pages(head) - 1UL));
-}
-
 #define swapcache_index(folio)	__page_file_index(&(folio)->page)
 
 /**
-- 
cgit v1.2.3


From 7836d9990079ed611199819ccf487061b748193a Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 27 May 2021 12:30:54 -0400
Subject: readahead: Convert page_cache_async_ra() to take a folio

Using the folio here avoids checking whether it's a tail page.
This patch mostly just enables some of the following patches.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
---
 include/linux/pagemap.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 8c2cad7f0c36..30302be6977f 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -993,7 +993,7 @@ struct readahead_control {
 void page_cache_ra_unbounded(struct readahead_control *,
 		unsigned long nr_to_read, unsigned long lookahead_count);
 void page_cache_sync_ra(struct readahead_control *, unsigned long req_count);
-void page_cache_async_ra(struct readahead_control *, struct page *,
+void page_cache_async_ra(struct readahead_control *, struct folio *,
 		unsigned long req_count);
 void readahead_expand(struct readahead_control *ractl,
 		      loff_t new_start, size_t new_len);
@@ -1040,7 +1040,7 @@ void page_cache_async_readahead(struct address_space *mapping,
 		struct page *page, pgoff_t index, unsigned long req_count)
 {
 	DEFINE_READAHEAD(ractl, file, ra, mapping, index);
-	page_cache_async_ra(&ractl, page, req_count);
+	page_cache_async_ra(&ractl, page_folio(page), req_count);
 }
 
 static inline struct folio *__readahead_folio(struct readahead_control *ractl)
-- 
cgit v1.2.3


From 539a3322f208db478db88c4a76239476defce6b1 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 16 Dec 2020 11:45:30 -0500
Subject: filemap: Add read_cache_folio and read_mapping_folio

Reimplement read_cache_page() as a wrapper around read_cache_folio().
Saves over 400 bytes of text from do_read_cache_folio() which more
than makes up for the extra 100 bytes of text added to the various
wrapper functions.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
---
 include/linux/pagemap.h | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 30302be6977f..7bef50ea5435 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -629,8 +629,10 @@ static inline struct page *grab_cache_page(struct address_space *mapping,
 	return find_or_create_page(mapping, index, mapping_gfp_mask(mapping));
 }
 
-extern struct page * read_cache_page(struct address_space *mapping,
-				pgoff_t index, filler_t *filler, void *data);
+struct folio *read_cache_folio(struct address_space *, pgoff_t index,
+		filler_t *filler, void *data);
+struct page *read_cache_page(struct address_space *, pgoff_t index,
+		filler_t *filler, void *data);
 extern struct page * read_cache_page_gfp(struct address_space *mapping,
 				pgoff_t index, gfp_t gfp_mask);
 extern int read_cache_pages(struct address_space *mapping,
@@ -642,6 +644,12 @@ static inline struct page *read_mapping_page(struct address_space *mapping,
 	return read_cache_page(mapping, index, NULL, data);
 }
 
+static inline struct folio *read_mapping_folio(struct address_space *mapping,
+				pgoff_t index, void *data)
+{
+	return read_cache_folio(mapping, index, NULL, data);
+}
+
 /*
  * Get index of the page within radix-tree (but not for hugetlb pages).
  * (TODO: remove once hugetlb pages will have ->index in PAGE_SIZE)
-- 
cgit v1.2.3


From 82c50f8b443359ec99348cd9b1289f55cd47779d Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 28 Jul 2021 15:14:48 -0400
Subject: filemap: Add filemap_release_folio()

Reimplement try_to_release_page() as a wrapper around
filemap_release_folio().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
---
 include/linux/mm.h      | 1 -
 include/linux/pagemap.h | 2 ++
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 72ca04f16711..145f045b0ddc 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1970,7 +1970,6 @@ int get_kernel_pages(const struct kvec *iov, int nr_pages, int write,
 			struct page **pages);
 struct page *get_dump_page(unsigned long addr);
 
-extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
 extern void do_invalidatepage(struct page *page, unsigned int offset,
 			      unsigned int length);
 
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 7bef50ea5435..eb6e58e106c8 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -939,6 +939,8 @@ static inline void __delete_from_page_cache(struct page *page, void *shadow)
 void replace_page_cache_page(struct page *old, struct page *new);
 void delete_from_page_cache_batch(struct address_space *mapping,
 				  struct pagevec *pvec);
+int try_to_release_page(struct page *page, gfp_t gfp);
+bool filemap_release_folio(struct folio *folio, gfp_t gfp);
 loff_t mapping_seek_hole_data(struct address_space *, loff_t start, loff_t end,
 		int whence);
 
-- 
cgit v1.2.3


From 77e2a04745ff8e391ad402e2d2d1157a5d3a7ebc Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Tue, 4 Jan 2022 19:51:08 +0000
Subject: ACPI: PCC: Implement OperationRegion handler for the PCC Type 3
 subtype

PCC OpRegion provides a mechanism to communicate with the platform
directly from the AML. PCCT provides the list of PCC channel available
in the platform, a subset or all of them can be used in PCC Opregion.

This patch registers the PCC OpRegion handler before ACPI tables are
loaded. This relies on the special context data passed to identify and
set up the PCC channel before the OpRegion handler is executed for the
first time.

Typical PCC Opregion declaration looks like this:

OperationRegion (PFRM, PCC, 2, 0x74)
Field (PFRM, ByteAcc, NoLock, Preserve)
{
    SIGN,   32,
    FLGS,   32,
    LEN,    32,
    CMD,    32,
    DATA,   800
}

It contains four named double words followed by 100 bytes of buffer
names DATA.

ASL can fill out the buffer something like:

    /* Create global or local buffer */
    Name (BUFF, Buffer (0x0C){})
    /* Create double word fields over the buffer */
    CreateDWordField (BUFF, 0x0, WD0)
    CreateDWordField (BUFF, 0x04, WD1)
    CreateDWordField (BUFF, 0x08, WD2)

    /* Fill the named fields */
    WD0 = 0x50434300
    SIGN = BUFF
    WD0 = 1
    FLGS = BUFF
    WD0 = 0x10
    LEN = BUFF

    /* Fill the payload in the DATA buffer */
    WD0 = 0
    WD1 = 0x08
    WD2 = 0
    DATA = BUFF

    /* Write to CMD field to trigger handler */
    WD0 = 0x4404
    CMD = BUFF

This buffer is received by acpi_pcc_opregion_space_handler. This
handler will fetch the complete buffer via internal_pcc_buffer.

The setup handler will receive the special PCC context data which will
contain the PCC channel index which used to set up the channel. The
buffer pointer and length is saved in region context which is then used
in the handler.

(kernel test robot: Build failure with CONFIG_ACPI_DEBUGGER)
Link: https://lore.kernel.org/r/202201041539.feAV0l27-lkp@intel.com
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/acpi.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index b28f8790192a..93eaba2485e3 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1389,6 +1389,12 @@ static inline int find_acpi_cpu_cache_topology(unsigned int cpu, int level)
 }
 #endif
 
+#ifdef CONFIG_ACPI_PCC
+void acpi_init_pcc(void);
+#else
+static inline void acpi_init_pcc(void) { }
+#endif
+
 #ifdef CONFIG_ACPI
 extern void acpi_device_notify(struct device *dev);
 extern void acpi_device_notify_remove(struct device *dev);
-- 
cgit v1.2.3


From 742bef476ca5352b16063161fb73a56629a6d995 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Tue, 21 Dec 2021 08:20:35 +0100
Subject: ata: libata: move ata_{port,link,dev}_dbg to standard pr_XXX() macros

Use standard pr_{debug,info,notice,warn,err} macros instead of the
hand-crafted printk helpers.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
---
 include/linux/libata.h | 69 +++++++++++++++++++++++++++-----------------------
 1 file changed, 38 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 235fdbeb19ea..39cdde0b9491 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1489,51 +1489,61 @@ static inline int sata_srst_pmp(struct ata_link *link)
 	return link->pmp;
 }
 
-/*
- * printk helpers
- */
-__printf(3, 4)
-void ata_port_printk(const struct ata_port *ap, const char *level,
-		     const char *fmt, ...);
-__printf(3, 4)
-void ata_link_printk(const struct ata_link *link, const char *level,
-		     const char *fmt, ...);
-__printf(3, 4)
-void ata_dev_printk(const struct ata_device *dev, const char *level,
-		    const char *fmt, ...);
+#define ata_port_printk(level, ap, fmt, ...)			\
+	pr_ ## level ("ata%u: " fmt, (ap)->print_id, ##__VA_ARGS__)
 
 #define ata_port_err(ap, fmt, ...)				\
-	ata_port_printk(ap, KERN_ERR, fmt, ##__VA_ARGS__)
+	ata_port_printk(err, ap, fmt, ##__VA_ARGS__)
 #define ata_port_warn(ap, fmt, ...)				\
-	ata_port_printk(ap, KERN_WARNING, fmt, ##__VA_ARGS__)
+	ata_port_printk(warn, ap, fmt, ##__VA_ARGS__)
 #define ata_port_notice(ap, fmt, ...)				\
-	ata_port_printk(ap, KERN_NOTICE, fmt, ##__VA_ARGS__)
+	ata_port_printk(notice, ap, fmt, ##__VA_ARGS__)
 #define ata_port_info(ap, fmt, ...)				\
-	ata_port_printk(ap, KERN_INFO, fmt, ##__VA_ARGS__)
+	ata_port_printk(info, ap, fmt, ##__VA_ARGS__)
 #define ata_port_dbg(ap, fmt, ...)				\
-	ata_port_printk(ap, KERN_DEBUG, fmt, ##__VA_ARGS__)
+	ata_port_printk(debug, ap, fmt, ##__VA_ARGS__)
+
+#define ata_link_printk(level, link, fmt, ...)			\
+do {								\
+	if (sata_pmp_attached((link)->ap) ||			\
+	    (link)->ap->slave_link)				\
+		pr_ ## level ("ata%u.%02u: " fmt,		\
+			      (link)->ap->print_id,		\
+			      (link)->pmp,			\
+			      ##__VA_ARGS__);			\
+        else							\
+		pr_ ## level ("ata%u: " fmt,			\
+			      (link)->ap->print_id,		\
+			      ##__VA_ARGS__);			\
+} while (0)
 
 #define ata_link_err(link, fmt, ...)				\
-	ata_link_printk(link, KERN_ERR, fmt, ##__VA_ARGS__)
+	ata_link_printk(err, link, fmt, ##__VA_ARGS__)
 #define ata_link_warn(link, fmt, ...)				\
-	ata_link_printk(link, KERN_WARNING, fmt, ##__VA_ARGS__)
+	ata_link_printk(warn, link, fmt, ##__VA_ARGS__)
 #define ata_link_notice(link, fmt, ...)				\
-	ata_link_printk(link, KERN_NOTICE, fmt, ##__VA_ARGS__)
+	ata_link_printk(notice, link, fmt, ##__VA_ARGS__)
 #define ata_link_info(link, fmt, ...)				\
-	ata_link_printk(link, KERN_INFO, fmt, ##__VA_ARGS__)
+	ata_link_printk(info, link, fmt, ##__VA_ARGS__)
 #define ata_link_dbg(link, fmt, ...)				\
-	ata_link_printk(link, KERN_DEBUG, fmt, ##__VA_ARGS__)
+	ata_link_printk(debug, link, fmt, ##__VA_ARGS__)
+
+#define ata_dev_printk(level, dev, fmt, ...)			\
+        pr_ ## level("ata%u.%02u: " fmt,			\
+               (dev)->link->ap->print_id,			\
+	       (dev)->link->pmp + (dev)->devno,			\
+	       ##__VA_ARGS__)
 
 #define ata_dev_err(dev, fmt, ...)				\
-	ata_dev_printk(dev, KERN_ERR, fmt, ##__VA_ARGS__)
+	ata_dev_printk(err, dev, fmt, ##__VA_ARGS__)
 #define ata_dev_warn(dev, fmt, ...)				\
-	ata_dev_printk(dev, KERN_WARNING, fmt, ##__VA_ARGS__)
+	ata_dev_printk(warn, dev, fmt, ##__VA_ARGS__)
 #define ata_dev_notice(dev, fmt, ...)				\
-	ata_dev_printk(dev, KERN_NOTICE, fmt, ##__VA_ARGS__)
+	ata_dev_printk(notice, dev, fmt, ##__VA_ARGS__)
 #define ata_dev_info(dev, fmt, ...)				\
-	ata_dev_printk(dev, KERN_INFO, fmt, ##__VA_ARGS__)
+	ata_dev_printk(info, dev, fmt, ##__VA_ARGS__)
 #define ata_dev_dbg(dev, fmt, ...)				\
-	ata_dev_printk(dev, KERN_DEBUG, fmt, ##__VA_ARGS__)
+	ata_dev_printk(debug, dev, fmt, ##__VA_ARGS__)
 
 void ata_print_version(const struct device *dev, const char *version);
 
@@ -2067,11 +2077,8 @@ static inline u8 ata_wait_idle(struct ata_port *ap)
 {
 	u8 status = ata_sff_busy_wait(ap, ATA_BUSY | ATA_DRQ, 1000);
 
-#ifdef ATA_DEBUG
 	if (status != 0xff && (status & (ATA_BUSY | ATA_DRQ)))
-		ata_port_printk(ap, KERN_DEBUG, "abnormal Status 0x%X\n",
-				status);
-#endif
+		ata_port_dbg(ap, "abnormal Status 0x%X\n", status);
 
 	return status;
 }
-- 
cgit v1.2.3


From d97c75edd806669c9f4b56c0ddae37725c0b708c Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Tue, 21 Dec 2021 08:21:01 +0100
Subject: ata: libata: drop ata_msg_error() and ata_msg_intr()

Unused.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
---
 include/linux/libata.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 39cdde0b9491..4f0a85f4e69a 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -78,8 +78,6 @@ enum {
 	ATA_MSG_WARN	= 0x0008,
 	ATA_MSG_MALLOC	= 0x0010,
 	ATA_MSG_CTL	= 0x0020,
-	ATA_MSG_INTR	= 0x0040,
-	ATA_MSG_ERR	= 0x0080,
 };
 
 #define ata_msg_drv(p)    ((p)->msg_enable & ATA_MSG_DRV)
@@ -88,8 +86,6 @@ enum {
 #define ata_msg_warn(p)   ((p)->msg_enable & ATA_MSG_WARN)
 #define ata_msg_malloc(p) ((p)->msg_enable & ATA_MSG_MALLOC)
 #define ata_msg_ctl(p)    ((p)->msg_enable & ATA_MSG_CTL)
-#define ata_msg_intr(p)   ((p)->msg_enable & ATA_MSG_INTR)
-#define ata_msg_err(p)    ((p)->msg_enable & ATA_MSG_ERR)
 
 static inline u32 ata_msg_init(int dval, int default_msg_enable_bits)
 {
-- 
cgit v1.2.3


From 5cef96b4207e01c9cdb7752acaa178056fe94632 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Tue, 21 Dec 2021 08:21:02 +0100
Subject: ata: libata: drop ata_msg_ctl()

The one caller have been converted to dynamic debugging.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
---
 include/linux/libata.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 4f0a85f4e69a..e384cce62963 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -77,7 +77,6 @@ enum {
 	ATA_MSG_PROBE	= 0x0004,
 	ATA_MSG_WARN	= 0x0008,
 	ATA_MSG_MALLOC	= 0x0010,
-	ATA_MSG_CTL	= 0x0020,
 };
 
 #define ata_msg_drv(p)    ((p)->msg_enable & ATA_MSG_DRV)
@@ -85,7 +84,6 @@ enum {
 #define ata_msg_probe(p)  ((p)->msg_enable & ATA_MSG_PROBE)
 #define ata_msg_warn(p)   ((p)->msg_enable & ATA_MSG_WARN)
 #define ata_msg_malloc(p) ((p)->msg_enable & ATA_MSG_MALLOC)
-#define ata_msg_ctl(p)    ((p)->msg_enable & ATA_MSG_CTL)
 
 static inline u32 ata_msg_init(int dval, int default_msg_enable_bits)
 {
-- 
cgit v1.2.3


From 2f784b923d50cdef1f6bd24d7c18614321b0833a Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Tue, 21 Dec 2021 08:21:03 +0100
Subject: ata: libata: drop ata_msg_malloc()

Unused.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
---
 include/linux/libata.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index e384cce62963..5651bbf4902b 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -76,14 +76,12 @@ enum {
 	ATA_MSG_INFO	= 0x0002,
 	ATA_MSG_PROBE	= 0x0004,
 	ATA_MSG_WARN	= 0x0008,
-	ATA_MSG_MALLOC	= 0x0010,
 };
 
 #define ata_msg_drv(p)    ((p)->msg_enable & ATA_MSG_DRV)
 #define ata_msg_info(p)   ((p)->msg_enable & ATA_MSG_INFO)
 #define ata_msg_probe(p)  ((p)->msg_enable & ATA_MSG_PROBE)
 #define ata_msg_warn(p)   ((p)->msg_enable & ATA_MSG_WARN)
-#define ata_msg_malloc(p) ((p)->msg_enable & ATA_MSG_MALLOC)
 
 static inline u32 ata_msg_init(int dval, int default_msg_enable_bits)
 {
-- 
cgit v1.2.3


From 16d424672716dc886fb58ec4a47a408db4781cc0 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Tue, 21 Dec 2021 08:21:04 +0100
Subject: ata: libata: drop ata_msg_warn()

The WARN level was always enabled, so drop ata_msg_warn().

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
---
 include/linux/libata.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 5651bbf4902b..0e5ed2ff94be 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -75,13 +75,11 @@ enum {
 	ATA_MSG_DRV	= 0x0001,
 	ATA_MSG_INFO	= 0x0002,
 	ATA_MSG_PROBE	= 0x0004,
-	ATA_MSG_WARN	= 0x0008,
 };
 
 #define ata_msg_drv(p)    ((p)->msg_enable & ATA_MSG_DRV)
 #define ata_msg_info(p)   ((p)->msg_enable & ATA_MSG_INFO)
 #define ata_msg_probe(p)  ((p)->msg_enable & ATA_MSG_PROBE)
-#define ata_msg_warn(p)   ((p)->msg_enable & ATA_MSG_WARN)
 
 static inline u32 ata_msg_init(int dval, int default_msg_enable_bits)
 {
-- 
cgit v1.2.3


From 17a1e1be2fc7dc99945b41df0485037dcb6044d0 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Tue, 21 Dec 2021 08:21:05 +0100
Subject: ata: libata: drop ata_msg_probe()

All callsites have been converted to dynamic debugging.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
---
 include/linux/libata.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 0e5ed2ff94be..455d7e77e562 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -74,12 +74,10 @@
 enum {
 	ATA_MSG_DRV	= 0x0001,
 	ATA_MSG_INFO	= 0x0002,
-	ATA_MSG_PROBE	= 0x0004,
 };
 
 #define ata_msg_drv(p)    ((p)->msg_enable & ATA_MSG_DRV)
 #define ata_msg_info(p)   ((p)->msg_enable & ATA_MSG_INFO)
-#define ata_msg_probe(p)  ((p)->msg_enable & ATA_MSG_PROBE)
 
 static inline u32 ata_msg_init(int dval, int default_msg_enable_bits)
 {
-- 
cgit v1.2.3


From 96c810f216cb6da15bfa8fe8ef3bf73ca91c5dd8 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Tue, 21 Dec 2021 08:21:06 +0100
Subject: ata: libata: drop ata_msg_info()

Convert the sole caller to ata_dev_dbg() and remove the definition.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
---
 include/linux/libata.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 455d7e77e562..524d09b1dc82 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -73,11 +73,9 @@
 
 enum {
 	ATA_MSG_DRV	= 0x0001,
-	ATA_MSG_INFO	= 0x0002,
 };
 
 #define ata_msg_drv(p)    ((p)->msg_enable & ATA_MSG_DRV)
-#define ata_msg_info(p)   ((p)->msg_enable & ATA_MSG_INFO)
 
 static inline u32 ata_msg_init(int dval, int default_msg_enable_bits)
 {
-- 
cgit v1.2.3


From 1c95a27c1e544f723f6e0e5a4384098f92996ec0 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Tue, 21 Dec 2021 08:21:07 +0100
Subject: ata: libata: drop ata_msg_drv()

Callers are already protected by ata_dev_print_info(), so no need
to have an additional configuration parameter here.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
---
 include/linux/libata.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 524d09b1dc82..65172609a005 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -71,12 +71,6 @@
 /* NEW: debug levels */
 #define HAVE_LIBATA_MSG 1
 
-enum {
-	ATA_MSG_DRV	= 0x0001,
-};
-
-#define ata_msg_drv(p)    ((p)->msg_enable & ATA_MSG_DRV)
-
 static inline u32 ata_msg_init(int dval, int default_msg_enable_bits)
 {
 	if (dval < 0 || dval >= (sizeof(u32) * 8))
-- 
cgit v1.2.3


From db45905e74e6ae035305719bc683eca40f526669 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Tue, 21 Dec 2021 08:21:08 +0100
Subject: ata: libata: remove 'new' ata message handling

Remove the remaining bits for the 'new' ata message handling.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
---
 include/linux/libata.h | 13 -------------
 1 file changed, 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 65172609a005..145c0132b75e 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -68,18 +68,6 @@
 	}							\
 })
 
-/* NEW: debug levels */
-#define HAVE_LIBATA_MSG 1
-
-static inline u32 ata_msg_init(int dval, int default_msg_enable_bits)
-{
-	if (dval < 0 || dval >= (sizeof(u32) * 8))
-		return default_msg_enable_bits; /* should be 0x1 - only driver info msgs */
-	if (!dval)
-		return 0;
-	return (1 << dval) - 1;
-}
-
 /* defines only for the constants which don't work well as enums */
 #define ATA_TAG_POISON		0xfafbfcfdU
 
@@ -864,7 +852,6 @@ struct ata_port {
 
 	unsigned int		hsm_task_state;
 
-	u32			msg_enable;
 	struct list_head	eh_done_q;
 	wait_queue_head_t	eh_wait_q;
 	int			eh_tries;
-- 
cgit v1.2.3


From 870bb833c0acb29d8471eac5c2d2e6274826dbb6 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Tue, 21 Dec 2021 08:21:09 +0100
Subject: ata: libata: remove debug compilation switches

Unused now, so remove and drop any references to them.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
---
 include/linux/libata.h | 16 ----------------
 1 file changed, 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 145c0132b75e..c258f69106f4 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -39,25 +39,9 @@
  * compile-time options: to be removed as soon as all the drivers are
  * converted to the new debugging mechanism
  */
-#undef ATA_DEBUG		/* debugging output */
-#undef ATA_VERBOSE_DEBUG	/* yet more debugging output */
 #undef ATA_IRQ_TRAP		/* define to ack screaming irqs */
-#undef ATA_NDEBUG		/* define to disable quick runtime checks */
 
 
-/* note: prints function name for you */
-#ifdef ATA_DEBUG
-#define DPRINTK(fmt, args...) printk(KERN_ERR "%s: " fmt, __func__, ## args)
-#ifdef ATA_VERBOSE_DEBUG
-#define VPRINTK(fmt, args...) printk(KERN_ERR "%s: " fmt, __func__, ## args)
-#else
-#define VPRINTK(fmt, args...)
-#endif	/* ATA_VERBOSE_DEBUG */
-#else
-#define DPRINTK(fmt, args...)
-#define VPRINTK(fmt, args...)
-#endif	/* ATA_DEBUG */
-
 #define ata_print_version_once(dev, version)			\
 ({								\
 	static bool __print_once;				\
-- 
cgit v1.2.3


From cc4b08c31b5c51352f258032cc65e884b3e61e6a Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Date: Tue, 7 Dec 2021 21:15:31 +0900
Subject: can: do not increase tx_bytes statistics for RTR frames
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The actual payload length of the CAN Remote Transmission Request (RTR)
frames is always 0, i.e. no payload is transmitted on the wire.
However, those RTR frames still use the DLC to indicate the length of
the requested frame.

As such, net_device_stats::tx_bytes should not be increased when
sending RTR frames.

The function can_get_echo_skb() already returns the correct length,
even for RTR frames (c.f. [1]). However, for historical reasons, the
drivers do not use can_get_echo_skb()'s return value and instead, most
of them store a temporary length (or dlc) in some local structure or
array. Using the return value of can_get_echo_skb() solves the
issue. After doing this, such length/dlc fields become unused and so
this patch does the adequate cleaning when needed.

This patch fixes all the CAN drivers.

Finally, can_get_echo_skb() is decorated with the __must_check
attribute in order to force future drivers to correctly use its return
value (else the compiler would emit a warning).

[1] commit ed3320cec279 ("can: dev: __can_get_echo_skb():
fix real payload length return value for RTR frames")

Link: https://lore.kernel.org/all/20211207121531.42941-6-mailhol.vincent@wanadoo.fr
Cc: Nicolas Ferre <nicolas.ferre@microchip.com>
Cc: Alexandre Belloni <alexandre.belloni@bootlin.com>
Cc: Ludovic Desroches <ludovic.desroches@microchip.com>
Cc: Maxime Ripard <mripard@kernel.org>
Cc: Chen-Yu Tsai <wens@csie.org>
Cc: Jernej Skrabec <jernej.skrabec@gmail.com>
Cc: Yasushi SHOJI <yashi@spacecubics.com>
Cc: Oliver Hartkopp <socketcan@hartkopp.net>
Cc: Stephane Grosjean <s.grosjean@peak-system.com>
Cc: Andreas Larsson <andreas@gaisler.com>
Tested-by: Jimmy Assarsson <extja@kvaser.com> # kvaser
Signed-off-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Acked-by: Stefan Mätje <stefan.maetje@esd.eu> # esd_usb2
Tested-by: Stefan Mätje <stefan.maetje@esd.eu> # esd_usb2
[mkl: add conversion for grcan]
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/linux/can/skb.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h
index d311bc369a39..fdb22b00674a 100644
--- a/include/linux/can/skb.h
+++ b/include/linux/can/skb.h
@@ -21,8 +21,9 @@ int can_put_echo_skb(struct sk_buff *skb, struct net_device *dev,
 		     unsigned int idx, unsigned int frame_len);
 struct sk_buff *__can_get_echo_skb(struct net_device *dev, unsigned int idx,
 				   u8 *len_ptr, unsigned int *frame_len_ptr);
-unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx,
-			      unsigned int *frame_len_ptr);
+unsigned int __must_check can_get_echo_skb(struct net_device *dev,
+					   unsigned int idx,
+					   unsigned int *frame_len_ptr);
 void can_free_echo_skb(struct net_device *dev, unsigned int idx,
 		       unsigned int *frame_len_ptr);
 struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf);
-- 
cgit v1.2.3


From c9e1d8ed304cc6106c3241add170193995953325 Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Date: Tue, 14 Dec 2021 01:02:23 +0900
Subject: can: dev: replace can_priv::ctrlmode_static by
 can_get_static_ctrlmode()

The statically enabled features of a CAN controller can be retrieved
using below formula:

| u32 ctrlmode_static = priv->ctrlmode & ~priv->ctrlmode_supported;

As such, there is no need to store this information. This patch remove
the field ctrlmode_static of struct can_priv and provides, in
replacement, the inline function can_get_static_ctrlmode() which
returns the same value.

Link: https://lore.kernel.org/all/20211213160226.56219-2-mailhol.vincent@wanadoo.fr
Signed-off-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/linux/can/dev.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index 45f19d9db5ca..92e2d69462f0 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -69,7 +69,6 @@ struct can_priv {
 	/* CAN controller features - see include/uapi/linux/can/netlink.h */
 	u32 ctrlmode;		/* current options setting */
 	u32 ctrlmode_supported;	/* options that can be modified by netlink */
-	u32 ctrlmode_static;	/* static enabled options for driver/hardware */
 
 	int restart_ms;
 	struct delayed_work restart_work;
@@ -139,13 +138,17 @@ static inline void can_set_static_ctrlmode(struct net_device *dev,
 
 	/* alloc_candev() succeeded => netdev_priv() is valid at this point */
 	priv->ctrlmode = static_mode;
-	priv->ctrlmode_static = static_mode;
 
 	/* override MTU which was set by default in can_setup()? */
 	if (static_mode & CAN_CTRLMODE_FD)
 		dev->mtu = CANFD_MTU;
 }
 
+static inline u32 can_get_static_ctrlmode(struct can_priv *priv)
+{
+	return priv->ctrlmode & ~priv->ctrlmode_supported;
+}
+
 void can_setup(struct net_device *dev);
 
 struct net_device *alloc_candev_mqs(int sizeof_priv, unsigned int echo_skb_max,
-- 
cgit v1.2.3


From 7d4a101c0bd3c6e5c6e45c705a54f7bc8f6c128d Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Date: Tue, 14 Dec 2021 01:02:24 +0900
Subject: can: dev: add sanity check in can_set_static_ctrlmode()

Previous patch removed can_priv::ctrlmode_static to replace it with
can_get_static_ctrlmode().

A condition sine qua non for this to work is that the controller
static modes should never be set in can_priv::ctrlmode_supported
(c.f. the comment on can_priv::ctrlmode_supported which states that it
is for "options that can be *modified* by netlink"). Also, this
condition is already correctly fulfilled by all existing drivers
which rely on the ctrlmode_static feature.

Nonetheless, we added an extra safeguard in can_set_static_ctrlmode()
to return an error value and to warn the developer who would be
adventurous enough to set to static a given feature that is already
set to supported.

The drivers which rely on the static controller mode are then updated
to check the return value of can_set_static_ctrlmode().

Link: https://lore.kernel.org/all/20211213160226.56219-3-mailhol.vincent@wanadoo.fr
Signed-off-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/linux/can/dev.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index 92e2d69462f0..fff3f70df697 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -131,17 +131,24 @@ static inline s32 can_get_relative_tdco(const struct can_priv *priv)
 }
 
 /* helper to define static CAN controller features at device creation time */
-static inline void can_set_static_ctrlmode(struct net_device *dev,
-					   u32 static_mode)
+static inline int __must_check can_set_static_ctrlmode(struct net_device *dev,
+						       u32 static_mode)
 {
 	struct can_priv *priv = netdev_priv(dev);
 
 	/* alloc_candev() succeeded => netdev_priv() is valid at this point */
+	if (priv->ctrlmode_supported & static_mode) {
+		netdev_warn(dev,
+			    "Controller features can not be supported and static at the same time\n");
+		return -EINVAL;
+	}
 	priv->ctrlmode = static_mode;
 
 	/* override MTU which was set by default in can_setup()? */
 	if (static_mode & CAN_CTRLMODE_FD)
 		dev->mtu = CANFD_MTU;
+
+	return 0;
 }
 
 static inline u32 can_get_static_ctrlmode(struct can_priv *priv)
-- 
cgit v1.2.3


From 5fe1be81efd28bf2afcac218f23118dca6b1b648 Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Date: Tue, 14 Dec 2021 01:02:25 +0900
Subject: can: dev: reorder struct can_priv members for better packing

Save eight bytes of holes on x86-64 architectures by reordering the
members of struct can_priv.

Before:

| $ pahole -C can_priv drivers/net/can/dev/dev.o
| struct can_priv {
| 	struct net_device *        dev;                  /*     0     8 */
| 	struct can_device_stats    can_stats;            /*     8    24 */
| 	const struct can_bittiming_const  * bittiming_const; /*    32     8 */
| 	const struct can_bittiming_const  * data_bittiming_const; /*    40     8 */
| 	struct can_bittiming       bittiming;            /*    48    32 */
| 	/* --- cacheline 1 boundary (64 bytes) was 16 bytes ago --- */
| 	struct can_bittiming       data_bittiming;       /*    80    32 */
| 	const struct can_tdc_const  * tdc_const;         /*   112     8 */
| 	struct can_tdc             tdc;                  /*   120    12 */
| 	/* --- cacheline 2 boundary (128 bytes) was 4 bytes ago --- */
| 	unsigned int               bitrate_const_cnt;    /*   132     4 */
| 	const u32  *               bitrate_const;        /*   136     8 */
| 	const u32  *               data_bitrate_const;   /*   144     8 */
| 	unsigned int               data_bitrate_const_cnt; /*   152     4 */
| 	u32                        bitrate_max;          /*   156     4 */
| 	struct can_clock           clock;                /*   160     4 */
| 	unsigned int               termination_const_cnt; /*   164     4 */
| 	const u16  *               termination_const;    /*   168     8 */
| 	u16                        termination;          /*   176     2 */
|
| 	/* XXX 6 bytes hole, try to pack */
|
| 	struct gpio_desc *         termination_gpio;     /*   184     8 */
| 	/* --- cacheline 3 boundary (192 bytes) --- */
| 	u16                        termination_gpio_ohms[2]; /*   192     4 */
| 	enum can_state             state;                /*   196     4 */
| 	u32                        ctrlmode;             /*   200     4 */
| 	u32                        ctrlmode_supported;   /*   204     4 */
| 	int                        restart_ms;           /*   208     4 */
|
| 	/* XXX 4 bytes hole, try to pack */
|
| 	struct delayed_work        restart_work;         /*   216    88 */
|
| 	/* XXX last struct has 4 bytes of padding */
|
| 	/* --- cacheline 4 boundary (256 bytes) was 48 bytes ago --- */
| 	int                        (*do_set_bittiming)(struct net_device *); /*   304     8 */
| 	int                        (*do_set_data_bittiming)(struct net_device *); /*   312     8 */
| 	/* --- cacheline 5 boundary (320 bytes) --- */
| 	int                        (*do_set_mode)(struct net_device *, enum can_mode); /*   320     8 */
| 	int                        (*do_set_termination)(struct net_device *, u16); /*   328     8 */
| 	int                        (*do_get_state)(const struct net_device  *, enum can_state *); /*   336     8 */
| 	int                        (*do_get_berr_counter)(const struct net_device  *, struct can_berr_counter *); /*   344     8 */
| 	unsigned int               echo_skb_max;         /*   352     4 */
|
| 	/* XXX 4 bytes hole, try to pack */
|
| 	struct sk_buff * *         echo_skb;             /*   360     8 */
|
| 	/* size: 368, cachelines: 6, members: 32 */
| 	/* sum members: 354, holes: 3, sum holes: 14 */
| 	/* paddings: 1, sum paddings: 4 */
| 	/* last cacheline: 48 bytes */
| };

After:

| $ pahole -C can_priv drivers/net/can/dev/dev.o
| struct can_priv {
| 	struct net_device *        dev;                  /*     0     8 */
| 	struct can_device_stats    can_stats;            /*     8    24 */
| 	const struct can_bittiming_const  * bittiming_const; /*    32     8 */
| 	const struct can_bittiming_const  * data_bittiming_const; /*    40     8 */
| 	struct can_bittiming       bittiming;            /*    48    32 */
| 	/* --- cacheline 1 boundary (64 bytes) was 16 bytes ago --- */
| 	struct can_bittiming       data_bittiming;       /*    80    32 */
| 	const struct can_tdc_const  * tdc_const;         /*   112     8 */
| 	struct can_tdc             tdc;                  /*   120    12 */
| 	/* --- cacheline 2 boundary (128 bytes) was 4 bytes ago --- */
| 	unsigned int               bitrate_const_cnt;    /*   132     4 */
| 	const u32  *               bitrate_const;        /*   136     8 */
| 	const u32  *               data_bitrate_const;   /*   144     8 */
| 	unsigned int               data_bitrate_const_cnt; /*   152     4 */
| 	u32                        bitrate_max;          /*   156     4 */
| 	struct can_clock           clock;                /*   160     4 */
| 	unsigned int               termination_const_cnt; /*   164     4 */
| 	const u16  *               termination_const;    /*   168     8 */
| 	u16                        termination;          /*   176     2 */
|
| 	/* XXX 6 bytes hole, try to pack */
|
| 	struct gpio_desc *         termination_gpio;     /*   184     8 */
| 	/* --- cacheline 3 boundary (192 bytes) --- */
| 	u16                        termination_gpio_ohms[2]; /*   192     4 */
| 	unsigned int               echo_skb_max;         /*   196     4 */
| 	struct sk_buff * *         echo_skb;             /*   200     8 */
| 	enum can_state             state;                /*   208     4 */
| 	u32                        ctrlmode;             /*   212     4 */
| 	u32                        ctrlmode_supported;   /*   216     4 */
| 	int                        restart_ms;           /*   220     4 */
| 	struct delayed_work        restart_work;         /*   224    88 */
|
| 	/* XXX last struct has 4 bytes of padding */
|
| 	/* --- cacheline 4 boundary (256 bytes) was 56 bytes ago --- */
| 	int                        (*do_set_bittiming)(struct net_device *); /*   312     8 */
| 	/* --- cacheline 5 boundary (320 bytes) --- */
| 	int                        (*do_set_data_bittiming)(struct net_device *); /*   320     8 */
| 	int                        (*do_set_mode)(struct net_device *, enum can_mode); /*   328     8 */
| 	int                        (*do_set_termination)(struct net_device *, u16); /*   336     8 */
| 	int                        (*do_get_state)(const struct net_device  *, enum can_state *); /*   344     8 */
| 	int                        (*do_get_berr_counter)(const struct net_device  *, struct can_berr_counter *); /*   352     8 */
|
| 	/* size: 360, cachelines: 6, members: 32 */
| 	/* sum members: 354, holes: 1, sum holes: 6 */
| 	/* paddings: 1, sum paddings: 4 */
| 	/* last cacheline: 40 bytes */
| };

Link: https://lore.kernel.org/all/20211213160226.56219-4-mailhol.vincent@wanadoo.fr
Signed-off-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/linux/can/dev.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index fff3f70df697..c2ea47f30046 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -64,6 +64,9 @@ struct can_priv {
 	struct gpio_desc *termination_gpio;
 	u16 termination_gpio_ohms[CAN_TERMINATION_GPIO_MAX];
 
+	unsigned int echo_skb_max;
+	struct sk_buff **echo_skb;
+
 	enum can_state state;
 
 	/* CAN controller features - see include/uapi/linux/can/netlink.h */
@@ -83,9 +86,6 @@ struct can_priv {
 				   struct can_berr_counter *bec);
 	int (*do_get_auto_tdcv)(const struct net_device *dev, u32 *tdcv);
 
-	unsigned int echo_skb_max;
-	struct sk_buff **echo_skb;
-
 #ifdef CONFIG_CAN_LEDS
 	struct led_trigger *tx_led_trig;
 	char tx_led_trig_name[CAN_LED_NAME_SZ];
-- 
cgit v1.2.3


From c6af53f038aa32cec12e8a305ba07c7ef168f1b0 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Tue, 4 Jan 2022 12:07:00 +0000
Subject: net: mdio: add helpers to extract clause 45 regad and devad fields

Add a couple of helpers and definitions to extract the clause 45 regad
and devad fields from the regnum passed into MDIO drivers.

Tested-by: Daniel Golle <daniel@makrotopia.org>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mdio.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mdio.h b/include/linux/mdio.h
index 9f3587a61e14..ecac96d52e01 100644
--- a/include/linux/mdio.h
+++ b/include/linux/mdio.h
@@ -7,6 +7,7 @@
 #define __LINUX_MDIO_H__
 
 #include <uapi/linux/mdio.h>
+#include <linux/bitfield.h>
 #include <linux/mod_devicetable.h>
 
 /* Or MII_ADDR_C45 into regnum for read/write on mii_bus to enable the 21 bit
@@ -14,6 +15,7 @@
  */
 #define MII_ADDR_C45		(1<<30)
 #define MII_DEVADDR_C45_SHIFT	16
+#define MII_DEVADDR_C45_MASK	GENMASK(20, 16)
 #define MII_REGADDR_C45_MASK	GENMASK(15, 0)
 
 struct gpio_desc;
@@ -381,6 +383,16 @@ static inline u32 mdiobus_c45_addr(int devad, u16 regnum)
 	return MII_ADDR_C45 | devad << MII_DEVADDR_C45_SHIFT | regnum;
 }
 
+static inline u16 mdiobus_c45_regad(u32 regnum)
+{
+	return FIELD_GET(MII_REGADDR_C45_MASK, regnum);
+}
+
+static inline u16 mdiobus_c45_devad(u32 regnum)
+{
+	return FIELD_GET(MII_DEVADDR_C45_MASK, regnum);
+}
+
 static inline int __mdiobus_c45_read(struct mii_bus *bus, int prtad, int devad,
 				     u16 regnum)
 {
-- 
cgit v1.2.3


From 01ec4a2e8f01f027a0f06cad237c935da8d643bf Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 3 Jul 2021 00:23:39 +0200
Subject: headers/deps: USB: Optimize <linux/usb/ch9.h> dependencies, remove
 <linux/device.h>

The <linux/usb/ch9.h> header is used over 1,400 times in a typical distro
build, but few of its users actually need the full <linux/device.h> header.

          --------------------------------------------------------------------
          | Combined, preprocessed C code size of header, without line markers,
          | with comments stripped:
          -------------------------
  before: | #include <linux/usb/ch9.h>              | LOC:  7,078 | headers:  172
   after: | #include <linux/usb/ch9.h>              | LOC:    812 | headers:   38

Remove it and add it to the places that need it.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/ch9.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/usb/ch9.h b/include/linux/usb/ch9.h
index 1cffa34740b0..969e7dba6358 100644
--- a/include/linux/usb/ch9.h
+++ b/include/linux/usb/ch9.h
@@ -33,7 +33,6 @@
 #ifndef __LINUX_USB_CH9_H
 #define __LINUX_USB_CH9_H
 
-#include <linux/device.h>
 #include <uapi/linux/usb/ch9.h>
 
 /* USB 3.2 SuperSpeed Plus phy signaling rate generation and lane count */
@@ -45,6 +44,8 @@ enum usb_ssp_rate {
 	USB_SSP_GEN_2x2,
 };
 
+struct device;
+
 extern const char *usb_ep_type_string(int ep_type);
 extern const char *usb_speed_string(enum usb_device_speed speed);
 extern enum usb_device_speed usb_get_maximum_speed(struct device *dev);
-- 
cgit v1.2.3


From edce22e19bfa86efa2522d041d6367f2f099e8ed Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Wed, 5 Jan 2022 09:05:15 -0800
Subject: block: move rq_list macros to blk-mq.h

Move the request list macros to the header file that defines that struct
they operate on.

Signed-off-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20220105170518.3181469-2-kbusch@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 29 +++++++++++++++++++++++++++++
 include/linux/blkdev.h | 29 -----------------------------
 2 files changed, 29 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 550996cf419c..bf64b94cd64e 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -216,6 +216,35 @@ static inline unsigned short req_get_ioprio(struct request *req)
 #define rq_dma_dir(rq) \
 	(op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE)
 
+#define rq_list_add(listptr, rq)	do {		\
+	(rq)->rq_next = *(listptr);			\
+	*(listptr) = rq;				\
+} while (0)
+
+#define rq_list_pop(listptr)				\
+({							\
+	struct request *__req = NULL;			\
+	if ((listptr) && *(listptr))	{		\
+		__req = *(listptr);			\
+		*(listptr) = __req->rq_next;		\
+	}						\
+	__req;						\
+})
+
+#define rq_list_peek(listptr)				\
+({							\
+	struct request *__req = NULL;			\
+	if ((listptr) && *(listptr))			\
+		__req = *(listptr);			\
+	__req;						\
+})
+
+#define rq_list_for_each(listptr, pos)			\
+	for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos))
+
+#define rq_list_next(rq)	(rq)->rq_next
+#define rq_list_empty(list)	((list) == (struct request *) NULL)
+
 enum blk_eh_timer_return {
 	BLK_EH_DONE,		/* drivers has completed the command */
 	BLK_EH_RESET_TIMER,	/* reset timer and try again */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 22746b2d6825..9c95df26fc26 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1339,33 +1339,4 @@ struct io_comp_batch {
 
 #define DEFINE_IO_COMP_BATCH(name)	struct io_comp_batch name = { }
 
-#define rq_list_add(listptr, rq)	do {		\
-	(rq)->rq_next = *(listptr);			\
-	*(listptr) = rq;				\
-} while (0)
-
-#define rq_list_pop(listptr)				\
-({							\
-	struct request *__req = NULL;			\
-	if ((listptr) && *(listptr))	{		\
-		__req = *(listptr);			\
-		*(listptr) = __req->rq_next;		\
-	}						\
-	__req;						\
-})
-
-#define rq_list_peek(listptr)				\
-({							\
-	struct request *__req = NULL;			\
-	if ((listptr) && *(listptr))			\
-		__req = *(listptr);			\
-	__req;						\
-})
-
-#define rq_list_for_each(listptr, pos)			\
-	for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos))
-
-#define rq_list_next(rq)	(rq)->rq_next
-#define rq_list_empty(list)	((list) == (struct request *) NULL)
-
 #endif /* _LINUX_BLKDEV_H */
-- 
cgit v1.2.3


From 3764fd05e1f89530e2ee5cbff0b638f2b1141b90 Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Wed, 5 Jan 2022 09:05:16 -0800
Subject: block: introduce rq_list_for_each_safe macro

While iterating a list, a particular request may need to be removed for
special handling. Provide an iterator that can safely handle that.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Link: https://lore.kernel.org/r/20220105170518.3181469-3-kbusch@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index bf64b94cd64e..1467f0fa2142 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -242,6 +242,10 @@ static inline unsigned short req_get_ioprio(struct request *req)
 #define rq_list_for_each(listptr, pos)			\
 	for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos))
 
+#define rq_list_for_each_safe(listptr, pos, nxt)			\
+	for (pos = rq_list_peek((listptr)), nxt = rq_list_next(pos);	\
+		pos; pos = nxt, nxt = pos ? rq_list_next(pos) : NULL)
+
 #define rq_list_next(rq)	(rq)->rq_next
 #define rq_list_empty(list)	((list) == (struct request *) NULL)
 
-- 
cgit v1.2.3


From d2528be7a8b09af9796a270debd14101a72bb552 Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Wed, 5 Jan 2022 09:05:17 -0800
Subject: block: introduce rq_list_move

When iterating a list, a particular request may need to be moved for
special handling. Provide a helper function to achieve that so drivers
don't need to reimplement rqlist manipulation.

Signed-off-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20220105170518.3181469-4-kbusch@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 1467f0fa2142..f40a05ecca4a 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -249,6 +249,23 @@ static inline unsigned short req_get_ioprio(struct request *req)
 #define rq_list_next(rq)	(rq)->rq_next
 #define rq_list_empty(list)	((list) == (struct request *) NULL)
 
+/**
+ * rq_list_move() - move a struct request from one list to another
+ * @src: The source list @rq is currently in
+ * @dst: The destination list that @rq will be appended to
+ * @rq: The request to move
+ * @prev: The request preceding @rq in @src (NULL if @rq is the head)
+ */
+static void inline rq_list_move(struct request **src, struct request **dst,
+				struct request *rq, struct request *prev)
+{
+	if (prev)
+		prev->rq_next = rq->rq_next;
+	else
+		*src = rq->rq_next;
+	rq_list_add(dst, rq);
+}
+
 enum blk_eh_timer_return {
 	BLK_EH_DONE,		/* drivers has completed the command */
 	BLK_EH_RESET_TIMER,	/* reset timer and try again */
-- 
cgit v1.2.3


From d53ad5d8b218a885e95080d4d3d556b16b91b1b9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= <toke@redhat.com>
Date: Mon, 3 Jan 2022 16:08:09 +0100
Subject: xdp: Move conversion to xdp_frame out of map functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

All map redirect functions except XSK maps convert xdp_buff to xdp_frame
before enqueueing it. So move this conversion of out the map functions
and into xdp_do_redirect(). This removes a bit of duplicated code, but more
importantly it makes it possible to support caller-allocated xdp_frame
structures, which will be added in a subsequent commit.

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220103150812.87914-5-toke@redhat.com
---
 include/linux/bpf.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 26753139d5b4..6e947cd91152 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1669,17 +1669,17 @@ void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth);
 struct btf *bpf_get_btf_vmlinux(void);
 
 /* Map specifics */
-struct xdp_buff;
+struct xdp_frame;
 struct sk_buff;
 struct bpf_dtab_netdev;
 struct bpf_cpu_map_entry;
 
 void __dev_flush(void);
-int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
+int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
 		    struct net_device *dev_rx);
-int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf,
 		    struct net_device *dev_rx);
-int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
+int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx,
 			  struct bpf_map *map, bool exclude_ingress);
 int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
 			     struct bpf_prog *xdp_prog);
@@ -1688,7 +1688,7 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
 			   bool exclude_ingress);
 
 void __cpu_map_flush(void);
-int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
+int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf,
 		    struct net_device *dev_rx);
 int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu,
 			     struct sk_buff *skb);
@@ -1866,26 +1866,26 @@ static inline void __dev_flush(void)
 {
 }
 
-struct xdp_buff;
+struct xdp_frame;
 struct bpf_dtab_netdev;
 struct bpf_cpu_map_entry;
 
 static inline
-int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
+int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
 		    struct net_device *dev_rx)
 {
 	return 0;
 }
 
 static inline
-int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf,
 		    struct net_device *dev_rx)
 {
 	return 0;
 }
 
 static inline
-int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
+int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx,
 			  struct bpf_map *map, bool exclude_ingress)
 {
 	return 0;
@@ -1913,7 +1913,7 @@ static inline void __cpu_map_flush(void)
 }
 
 static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu,
-				  struct xdp_buff *xdp,
+				  struct xdp_frame *xdpf,
 				  struct net_device *dev_rx)
 {
 	return 0;
-- 
cgit v1.2.3


From 1372d34ccf6dd480332b2bcb2fd59a2b9a0df415 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= <toke@redhat.com>
Date: Mon, 3 Jan 2022 16:08:10 +0100
Subject: xdp: Add xdp_do_redirect_frame() for pre-computed xdp_frames
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add an xdp_do_redirect_frame() variant which supports pre-computed
xdp_frame structures. This will be used in bpf_prog_run() to avoid having
to write to the xdp_frame structure when the XDP program doesn't modify the
frame boundaries.

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220103150812.87914-6-toke@redhat.com
---
 include/linux/filter.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 60eec80fa1d4..71fa57b88bfc 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1019,6 +1019,10 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
 int xdp_do_redirect(struct net_device *dev,
 		    struct xdp_buff *xdp,
 		    struct bpf_prog *prog);
+int xdp_do_redirect_frame(struct net_device *dev,
+			  struct xdp_buff *xdp,
+			  struct xdp_frame *xdpf,
+			  struct bpf_prog *prog);
 void xdp_do_flush(void);
 
 /* The xdp_do_flush_map() helper has been renamed to drop the _map suffix, as
-- 
cgit v1.2.3


From ae16d059f8c9409eba0c412639def0494765b761 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Tue, 26 Oct 2021 18:22:44 +0200
Subject: mm/slub: Make object_err() static

There are no callers outside of mm/slub.c anymore.

Move freelist_corrupted() that calls object_err() to avoid a need for
forward declaration.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Roman Gushchin <guro@fb.com>
---
 include/linux/slub_def.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 0fa751b946fa..1ef68d4de9c0 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -156,9 +156,6 @@ static inline void sysfs_slab_release(struct kmem_cache *s)
 }
 #endif
 
-void object_err(struct kmem_cache *s, struct page *page,
-		u8 *object, char *reason);
-
 void *fixup_red_left(struct kmem_cache *s, void *p);
 
 static inline void *nearest_obj(struct kmem_cache *cache, struct page *page,
-- 
cgit v1.2.3


From d122019bf061cccc4583eb9ad40bf58c2fe517be Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 4 Oct 2021 14:45:51 +0100
Subject: mm: Split slab into its own type

Make struct slab independent of struct page. It still uses the
underlying memory in struct page for storing slab-specific data, but
slab and slub can now be weaned off using struct page directly.  Some of
the wrapper functions (slab_address() and slab_order()) still need to
cast to struct folio, but this is a significant disentanglement.

[ vbabka@suse.cz: Rebase on folios, use folio instead of page where
  possible.

  Do not duplicate flags field in struct slab, instead make the related
  accessors go through slab_folio(). For testing pfmemalloc use the
  folio_*_active flag accessors directly so the PageSlabPfmemalloc
  wrappers can be removed later.

  Make folio_slab() expect only folio_test_slab() == true folios and
  virt_to_slab() return NULL when folio_test_slab() == false.

  Move struct slab to mm/slab.h.

  Don't represent with struct slab pages that are not true slab pages,
  but just a compound page obtained directly rom page allocator (with
  large kmalloc() for SLUB and SLOB). ]

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Roman Gushchin <guro@fb.com>
---
 include/linux/mm_types.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index c3a6e6209600..1ae3537c7920 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -56,11 +56,11 @@ struct mem_cgroup;
  * in each subpage, but you may need to restore some of their values
  * afterwards.
  *
- * SLUB uses cmpxchg_double() to atomically update its freelist and
- * counters.  That requires that freelist & counters be adjacent and
- * double-word aligned.  We align all struct pages to double-word
- * boundaries, and ensure that 'freelist' is aligned within the
- * struct.
+ * SLUB uses cmpxchg_double() to atomically update its freelist and counters.
+ * That requires that freelist & counters in struct slab be adjacent and
+ * double-word aligned. Because struct slab currently just reinterprets the
+ * bits of struct page, we align all struct pages to double-word boundaries,
+ * and ensure that 'freelist' is aligned within struct slab.
  */
 #ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE
 #define _struct_page_alignment	__aligned(2 * sizeof(unsigned long))
-- 
cgit v1.2.3


From 0b3eb091d5759479d44cb793fad2c51ea06bdcec Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 4 Oct 2021 14:45:56 +0100
Subject: mm: Convert check_heap_object() to use struct slab

Ensure that we're not seeing a tail page inside __check_heap_object() by
converting to a slab instead of a page.  Take the opportunity to mark
the slab as const since we're not modifying it.  Also move the
declaration of __check_heap_object() to mm/slab.h so it's not available
to the wider kernel.

[ vbabka@suse.cz: in check_heap_object() only convert to struct slab for
  actual PageSlab pages; use folio as intermediate step instead of page ]

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Roman Gushchin <guro@fb.com>
---
 include/linux/slab.h | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 181045148b06..367366f1d1ff 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -189,14 +189,6 @@ bool kmem_valid_obj(void *object);
 void kmem_dump_obj(void *object);
 #endif
 
-#ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR
-void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
-			bool to_user);
-#else
-static inline void __check_heap_object(const void *ptr, unsigned long n,
-				       struct page *page, bool to_user) { }
-#endif
-
 /*
  * Some archs want to perform DMA into kmalloc caches and need a guaranteed
  * alignment larger than the alignment of a 64-bit integer.
-- 
cgit v1.2.3


From bb192ed9aa7191a5d65548f82c42b6750d65f569 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Wed, 3 Nov 2021 15:39:59 +0100
Subject: mm/slub: Convert most struct page to struct slab by spatch

The majority of conversion from struct page to struct slab in SLUB
internals can be delegated to a coccinelle semantic patch. This includes
renaming of variables with 'page' in name to 'slab', and similar.

Big thanks to Julia Lawall and Luis Chamberlain for help with
coccinelle.

// Options: --include-headers --no-includes --smpl-spacing include/linux/slub_def.h mm/slub.c
// Note: needs coccinelle 1.1.1 to avoid breaking whitespace, and ocaml for the
// embedded script

// build list of functions to exclude from applying the next rule
@initialize:ocaml@
@@

let ok_function p =
  not (List.mem (List.hd p).current_element ["nearest_obj";"obj_to_index";"objs_per_slab_page";"__slab_lock";"__slab_unlock";"free_nonslab_page";"kmalloc_large_node"])

// convert the type from struct page to struct page in all functions except the
// list from previous rule
// this also affects struct kmem_cache_cpu, but that's ok
@@
position p : script:ocaml() { ok_function p };
@@

- struct page@p
+ struct slab

// in struct kmem_cache_cpu, change the name from page to slab
// the type was already converted by the previous rule
@@
@@

struct kmem_cache_cpu {
...
-struct slab *page;
+struct slab *slab;
...
}

// there are many places that use c->page which is now c->slab after the
// previous rule
@@
struct kmem_cache_cpu *c;
@@

-c->page
+c->slab

@@
@@

struct kmem_cache {
...
- unsigned int cpu_partial_pages;
+ unsigned int cpu_partial_slabs;
...
}

@@
struct kmem_cache *s;
@@

- s->cpu_partial_pages
+ s->cpu_partial_slabs

@@
@@

static void
- setup_page_debug(
+ setup_slab_debug(
 ...)
 {...}

@@
@@

- setup_page_debug(
+ setup_slab_debug(
 ...);

// for all functions (with exceptions), change any "struct slab *page"
// parameter to "struct slab *slab" in the signature, and generally all
// occurences of "page" to "slab" in the body - with some special cases.

@@
identifier fn !~ "free_nonslab_page|obj_to_index|objs_per_slab_page|nearest_obj";
@@
 fn(...,
-   struct slab *page
+   struct slab *slab
    ,...)
 {
<...
- page
+ slab
...>
 }

// similar to previous but the param is called partial_page
@@
identifier fn;
@@

 fn(...,
-   struct slab *partial_page
+   struct slab *partial_slab
    ,...)
 {
<...
- partial_page
+ partial_slab
...>
 }

// similar to previous but for functions that take pointer to struct page ptr
@@
identifier fn;
@@

 fn(...,
-   struct slab **ret_page
+   struct slab **ret_slab
    ,...)
 {
<...
- ret_page
+ ret_slab
...>
 }

// functions converted by previous rules that were temporarily called using
// slab_page(E) so we want to remove the wrapper now that they accept struct
// slab ptr directly
@@
identifier fn =~ "slab_free|do_slab_free";
expression E;
@@

 fn(...,
- slab_page(E)
+ E
  ,...)

// similar to previous but for another pattern
@@
identifier fn =~ "slab_pad_check|check_object";
@@

 fn(...,
- folio_page(folio, 0)
+ slab
  ,...)

// functions that were returning struct page ptr and now will return struct
// slab ptr, including slab_page() wrapper removal
@@
identifier fn =~ "allocate_slab|new_slab";
expression E;
@@

 static
-struct slab *
+struct slab *
 fn(...)
 {
<...
- slab_page(E)
+ E
...>
 }

// rename any former struct page * declarations
@@
@@

struct slab *
(
- page
+ slab
|
- partial_page
+ partial_slab
|
- oldpage
+ oldslab
)
;

// this has to be separate from previous rule as page and page2 appear at the
// same line
@@
@@

struct slab *
-page2
+slab2
;

// similar but with initial assignment
@@
expression E;
@@

struct slab *
(
- page
+ slab
|
- flush_page
+ flush_slab
|
- discard_page
+ slab_to_discard
|
- page_to_unfreeze
+ slab_to_unfreeze
)
= E;

// convert most of struct page to struct slab usage inside functions (with
// exceptions), including specific variable renames
@@
identifier fn !~ "nearest_obj|obj_to_index|objs_per_slab_page|__slab_(un)*lock|__free_slab|free_nonslab_page|kmalloc_large_node";
expression E;
@@

 fn(...)
 {
<...
(
- int pages;
+ int slabs;
|
- int pages = E;
+ int slabs = E;
|
- page
+ slab
|
- flush_page
+ flush_slab
|
- partial_page
+ partial_slab
|
- oldpage->pages
+ oldslab->slabs
|
- oldpage
+ oldslab
|
- unsigned int nr_pages;
+ unsigned int nr_slabs;
|
- nr_pages
+ nr_slabs
|
- unsigned int partial_pages = E;
+ unsigned int partial_slabs = E;
|
- partial_pages
+ partial_slabs
)
...>
 }

// this has to be split out from the previous rule so that lines containing
// multiple matching changes will be fully converted
@@
identifier fn !~ "nearest_obj|obj_to_index|objs_per_slab_page|__slab_(un)*lock|__free_slab|free_nonslab_page|kmalloc_large_node";
@@

 fn(...)
 {
<...
(
- slab->pages
+ slab->slabs
|
- pages
+ slabs
|
- page2
+ slab2
|
- discard_page
+ slab_to_discard
|
- page_to_unfreeze
+ slab_to_unfreeze
)
...>
 }

// after we simply changed all occurences of page to slab, some usages need
// adjustment for slab-specific functions, or use slab_page() wrapper
@@
identifier fn !~ "nearest_obj|obj_to_index|objs_per_slab_page|__slab_(un)*lock|__free_slab|free_nonslab_page|kmalloc_large_node";
@@

 fn(...)
 {
<...
(
- page_slab(slab)
+ slab
|
- kasan_poison_slab(slab)
+ kasan_poison_slab(slab_page(slab))
|
- page_address(slab)
+ slab_address(slab)
|
- page_size(slab)
+ slab_size(slab)
|
- PageSlab(slab)
+ folio_test_slab(slab_folio(slab))
|
- page_to_nid(slab)
+ slab_nid(slab)
|
- compound_order(slab)
+ slab_order(slab)
)
...>
 }

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Cc: Julia Lawall <julia.lawall@inria.fr>
Cc: Luis Chamberlain <mcgrof@kernel.org>
---
 include/linux/slub_def.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 1ef68d4de9c0..00d99afe1c0e 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -48,9 +48,9 @@ enum stat_item {
 struct kmem_cache_cpu {
 	void **freelist;	/* Pointer to next available object */
 	unsigned long tid;	/* Globally unique transaction id */
-	struct page *page;	/* The slab from which we are allocating */
+	struct slab *slab;	/* The slab from which we are allocating */
 #ifdef CONFIG_SLUB_CPU_PARTIAL
-	struct page *partial;	/* Partially allocated frozen slabs */
+	struct slab *partial;	/* Partially allocated frozen slabs */
 #endif
 	local_lock_t lock;	/* Protects the fields above */
 #ifdef CONFIG_SLUB_STATS
@@ -100,7 +100,7 @@ struct kmem_cache {
 	/* Number of per cpu partial objects to keep around */
 	unsigned int cpu_partial;
 	/* Number of per cpu partial pages to keep around */
-	unsigned int cpu_partial_pages;
+	unsigned int cpu_partial_slabs;
 #endif
 	struct kmem_cache_order_objects oo;
 
-- 
cgit v1.2.3


From c2092c12064a9728b2928979f88575cc1c2247fa Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Mon, 15 Nov 2021 16:55:15 +0100
Subject: mm/slub: Finish struct page to struct slab conversion

Update comments mentioning pages to mention slabs where appropriate.
Also some goto labels.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Roman Gushchin <guro@fb.com>
---
 include/linux/slub_def.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 00d99afe1c0e..8a9c2876ca89 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -99,7 +99,7 @@ struct kmem_cache {
 #ifdef CONFIG_SLUB_CPU_PARTIAL
 	/* Number of per cpu partial objects to keep around */
 	unsigned int cpu_partial;
-	/* Number of per cpu partial pages to keep around */
+	/* Number of per cpu partial slabs to keep around */
 	unsigned int cpu_partial_slabs;
 #endif
 	struct kmem_cache_order_objects oo;
-- 
cgit v1.2.3


From 40f3bf0cb04c91d33531b1b95788ad2f0e4062cf Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Tue, 2 Nov 2021 15:42:04 +0100
Subject: mm: Convert struct page to struct slab in functions used by other
 subsystems

KASAN, KFENCE and memcg interact with SLAB or SLUB internals through
functions nearest_obj(), obj_to_index() and objs_per_slab() that use
struct page as parameter. This patch converts it to struct slab
including all callers, through a coccinelle semantic patch.

// Options: --include-headers --no-includes --smpl-spacing include/linux/slab_def.h include/linux/slub_def.h mm/slab.h mm/kasan/*.c mm/kfence/kfence_test.c mm/memcontrol.c mm/slab.c mm/slub.c
// Note: needs coccinelle 1.1.1 to avoid breaking whitespace

@@
@@

-objs_per_slab_page(
+objs_per_slab(
 ...
 )
 { ... }

@@
@@

-objs_per_slab_page(
+objs_per_slab(
 ...
 )

@@
identifier fn =~ "obj_to_index|objs_per_slab";
@@

 fn(...,
-   const struct page *page
+   const struct slab *slab
    ,...)
 {
<...
(
- page_address(page)
+ slab_address(slab)
|
- page
+ slab
)
...>
 }

@@
identifier fn =~ "nearest_obj";
@@

 fn(...,
-   struct page *page
+   const struct slab *slab
    ,...)
 {
<...
(
- page_address(page)
+ slab_address(slab)
|
- page
+ slab
)
...>
 }

@@
identifier fn =~ "nearest_obj|obj_to_index|objs_per_slab";
expression E;
@@

 fn(...,
(
- slab_page(E)
+ E
|
- virt_to_page(E)
+ virt_to_slab(E)
|
- virt_to_head_page(E)
+ virt_to_slab(E)
|
- page
+ page_slab(page)
)
  ,...)

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Julia Lawall <julia.lawall@inria.fr>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Marco Elver <elver@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: <kasan-dev@googlegroups.com>
Cc: <cgroups@vger.kernel.org>
---
 include/linux/slab_def.h | 16 ++++++++--------
 include/linux/slub_def.h | 18 +++++++++---------
 2 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index 3aa5e1e73ab6..e24c9aff6fed 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -87,11 +87,11 @@ struct kmem_cache {
 	struct kmem_cache_node *node[MAX_NUMNODES];
 };
 
-static inline void *nearest_obj(struct kmem_cache *cache, struct page *page,
+static inline void *nearest_obj(struct kmem_cache *cache, const struct slab *slab,
 				void *x)
 {
-	void *object = x - (x - page->s_mem) % cache->size;
-	void *last_object = page->s_mem + (cache->num - 1) * cache->size;
+	void *object = x - (x - slab->s_mem) % cache->size;
+	void *last_object = slab->s_mem + (cache->num - 1) * cache->size;
 
 	if (unlikely(object > last_object))
 		return last_object;
@@ -106,16 +106,16 @@ static inline void *nearest_obj(struct kmem_cache *cache, struct page *page,
  *   reciprocal_divide(offset, cache->reciprocal_buffer_size)
  */
 static inline unsigned int obj_to_index(const struct kmem_cache *cache,
-					const struct page *page, void *obj)
+					const struct slab *slab, void *obj)
 {
-	u32 offset = (obj - page->s_mem);
+	u32 offset = (obj - slab->s_mem);
 	return reciprocal_divide(offset, cache->reciprocal_buffer_size);
 }
 
-static inline int objs_per_slab_page(const struct kmem_cache *cache,
-				     const struct page *page)
+static inline int objs_per_slab(const struct kmem_cache *cache,
+				     const struct slab *slab)
 {
-	if (is_kfence_address(page_address(page)))
+	if (is_kfence_address(slab_address(slab)))
 		return 1;
 	return cache->num;
 }
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 8a9c2876ca89..33c5c0e3bd8d 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -158,11 +158,11 @@ static inline void sysfs_slab_release(struct kmem_cache *s)
 
 void *fixup_red_left(struct kmem_cache *s, void *p);
 
-static inline void *nearest_obj(struct kmem_cache *cache, struct page *page,
+static inline void *nearest_obj(struct kmem_cache *cache, const struct slab *slab,
 				void *x) {
-	void *object = x - (x - page_address(page)) % cache->size;
-	void *last_object = page_address(page) +
-		(page->objects - 1) * cache->size;
+	void *object = x - (x - slab_address(slab)) % cache->size;
+	void *last_object = slab_address(slab) +
+		(slab->objects - 1) * cache->size;
 	void *result = (unlikely(object > last_object)) ? last_object : object;
 
 	result = fixup_red_left(cache, result);
@@ -178,16 +178,16 @@ static inline unsigned int __obj_to_index(const struct kmem_cache *cache,
 }
 
 static inline unsigned int obj_to_index(const struct kmem_cache *cache,
-					const struct page *page, void *obj)
+					const struct slab *slab, void *obj)
 {
 	if (is_kfence_address(obj))
 		return 0;
-	return __obj_to_index(cache, page_address(page), obj);
+	return __obj_to_index(cache, slab_address(slab), obj);
 }
 
-static inline int objs_per_slab_page(const struct kmem_cache *cache,
-				     const struct page *page)
+static inline int objs_per_slab(const struct kmem_cache *cache,
+				     const struct slab *slab)
 {
-	return page->objects;
+	return slab->objects;
 }
 #endif /* _LINUX_SLUB_DEF_H */
-- 
cgit v1.2.3


From 4b5f8d9a895ada8e0abb58ccd35d9fe229e3a595 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Tue, 2 Nov 2021 22:42:04 +0100
Subject: mm/memcg: Convert slab objcgs from struct page to struct slab

page->memcg_data is used with MEMCG_DATA_OBJCGS flag only for slab pages
so convert all the related infrastructure to struct slab. Also use
struct folio instead of struct page when resolving object pointers.

This is not just mechanistic changing of types and names. Now in
mem_cgroup_from_obj() we use folio_test_slab() to decide if we interpret
the folio as a real slab instead of a large kmalloc, instead of relying
on MEMCG_DATA_OBJCGS bit that used to be checked in page_objcgs_check().
Similarly in memcg_slab_free_hook() where we can encounter
kmalloc_large() pages (here the folio slab flag check is implied by
virt_to_slab()). As a result, page_objcgs_check() can be dropped instead
of converted.

To avoid include cycles, move the inline definition of slab_objcgs()
from memcontrol.h to mm/slab.h.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: <cgroups@vger.kernel.org>
---
 include/linux/memcontrol.h | 48 ----------------------------------------------
 1 file changed, 48 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0c5c403f4be6..e34112f6a369 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -536,45 +536,6 @@ static inline bool folio_memcg_kmem(struct folio *folio)
 	return folio->memcg_data & MEMCG_DATA_KMEM;
 }
 
-/*
- * page_objcgs - get the object cgroups vector associated with a page
- * @page: a pointer to the page struct
- *
- * Returns a pointer to the object cgroups vector associated with the page,
- * or NULL. This function assumes that the page is known to have an
- * associated object cgroups vector. It's not safe to call this function
- * against pages, which might have an associated memory cgroup: e.g.
- * kernel stack pages.
- */
-static inline struct obj_cgroup **page_objcgs(struct page *page)
-{
-	unsigned long memcg_data = READ_ONCE(page->memcg_data);
-
-	VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS), page);
-	VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page);
-
-	return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
-}
-
-/*
- * page_objcgs_check - get the object cgroups vector associated with a page
- * @page: a pointer to the page struct
- *
- * Returns a pointer to the object cgroups vector associated with the page,
- * or NULL. This function is safe to use if the page can be directly associated
- * with a memory cgroup.
- */
-static inline struct obj_cgroup **page_objcgs_check(struct page *page)
-{
-	unsigned long memcg_data = READ_ONCE(page->memcg_data);
-
-	if (!memcg_data || !(memcg_data & MEMCG_DATA_OBJCGS))
-		return NULL;
-
-	VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page);
-
-	return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
-}
 
 #else
 static inline bool folio_memcg_kmem(struct folio *folio)
@@ -582,15 +543,6 @@ static inline bool folio_memcg_kmem(struct folio *folio)
 	return false;
 }
 
-static inline struct obj_cgroup **page_objcgs(struct page *page)
-{
-	return NULL;
-}
-
-static inline struct obj_cgroup **page_objcgs_check(struct page *page)
-{
-	return NULL;
-}
 #endif
 
 static inline bool PageMemcgKmem(struct page *page)
-- 
cgit v1.2.3


From 6e48a966dfd18987fec9385566a67d36e2b5fc11 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 4 Oct 2021 14:46:46 +0100
Subject: mm/kasan: Convert to struct folio and struct slab

KASAN accesses some slab related struct page fields so we need to
convert it to struct slab. Some places are a bit simplified thanks to
kasan_addr_to_slab() encapsulating the PageSlab flag check through
virt_to_slab().  When resolving object address to either a real slab or
a large kmalloc, use struct folio as the intermediate type for testing
the slab flag to avoid unnecessary implicit compound_head().

[ vbabka@suse.cz: use struct folio, adjust to differences in previous
  patches ]

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Tested-by: Hyeongogn Yoo <42.hyeyoo@gmail.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: <kasan-dev@googlegroups.com>
---
 include/linux/kasan.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index d8783b682669..fb78108d694e 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -9,6 +9,7 @@
 
 struct kmem_cache;
 struct page;
+struct slab;
 struct vm_struct;
 struct task_struct;
 
@@ -193,11 +194,11 @@ static __always_inline size_t kasan_metadata_size(struct kmem_cache *cache)
 	return 0;
 }
 
-void __kasan_poison_slab(struct page *page);
-static __always_inline void kasan_poison_slab(struct page *page)
+void __kasan_poison_slab(struct slab *slab);
+static __always_inline void kasan_poison_slab(struct slab *slab)
 {
 	if (kasan_enabled())
-		__kasan_poison_slab(page);
+		__kasan_poison_slab(slab);
 }
 
 void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object);
@@ -322,7 +323,7 @@ static inline void kasan_cache_create(struct kmem_cache *cache,
 				      slab_flags_t *flags) {}
 static inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) {}
 static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; }
-static inline void kasan_poison_slab(struct page *page) {}
+static inline void kasan_poison_slab(struct slab *slab) {}
 static inline void kasan_unpoison_object_data(struct kmem_cache *cache,
 					void *object) {}
 static inline void kasan_poison_object_data(struct kmem_cache *cache,
-- 
cgit v1.2.3


From c5e97ed154589524a1df4ae2be55c4cfdb0d0573 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 4 Oct 2021 14:46:48 +0100
Subject: bootmem: Use page->index instead of page->freelist

page->freelist is for the use of slab.  Using page->index is the same
set of bits as page->freelist, and by using an integer instead of a
pointer, we can avoid casts.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: <x86@kernel.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
---
 include/linux/bootmem_info.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bootmem_info.h b/include/linux/bootmem_info.h
index 2bc8b1f69c93..cc35d010fa94 100644
--- a/include/linux/bootmem_info.h
+++ b/include/linux/bootmem_info.h
@@ -30,7 +30,7 @@ void put_page_bootmem(struct page *page);
  */
 static inline void free_bootmem_page(struct page *page)
 {
-	unsigned long magic = (unsigned long)page->freelist;
+	unsigned long magic = page->index;
 
 	/*
 	 * The reserve_bootmem_region sets the reserved flag on bootmem
-- 
cgit v1.2.3


From 007747a984ea5e895b7d8b056b24ebf431e1e71d Mon Sep 17 00:00:00 2001
From: Miroslav Lichvar <mlichvar@redhat.com>
Date: Wed, 5 Jan 2022 11:33:26 +0100
Subject: net: fix SOF_TIMESTAMPING_BIND_PHC to work with multiple sockets

When multiple sockets using the SOF_TIMESTAMPING_BIND_PHC flag received
a packet with a hardware timestamp (e.g. multiple PTP instances in
different PTP domains using the UDPv4/v6 multicast or L2 transport),
the timestamps received on some sockets were corrupted due to repeated
conversion of the same timestamp (by the same or different vclocks).

Fix ptp_convert_timestamp() to not modify the shared skb timestamp
and return the converted timestamp as a ktime_t instead. If the
conversion fails, return 0 to not confuse the application with
timestamps corresponding to an unexpected PHC.

Fixes: d7c088265588 ("net: socket: support hardware timestamp conversion to PHC bound")
Signed-off-by: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Yangbo Lu <yangbo.lu@nxp.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Acked-by: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ptp_clock_kernel.h | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h
index 2e5565067355..554454cb8693 100644
--- a/include/linux/ptp_clock_kernel.h
+++ b/include/linux/ptp_clock_kernel.h
@@ -351,15 +351,17 @@ int ptp_get_vclocks_index(int pclock_index, int **vclock_index);
  *
  * @hwtstamps:    skb_shared_hwtstamps structure pointer
  * @vclock_index: phc index of ptp vclock.
+ *
+ * Returns converted timestamp, or 0 on error.
  */
-void ptp_convert_timestamp(struct skb_shared_hwtstamps *hwtstamps,
-			   int vclock_index);
+ktime_t ptp_convert_timestamp(const struct skb_shared_hwtstamps *hwtstamps,
+			      int vclock_index);
 #else
 static inline int ptp_get_vclocks_index(int pclock_index, int **vclock_index)
 { return 0; }
-static inline void ptp_convert_timestamp(struct skb_shared_hwtstamps *hwtstamps,
-					 int vclock_index)
-{ }
+static inline ktime_t ptp_convert_timestamp(const struct skb_shared_hwtstamps *hwtstamps,
+					    int vclock_index)
+{ return 0; }
 
 #endif
 
-- 
cgit v1.2.3


From eac1b93c14d645ef147b049ace0d5230df755548 Mon Sep 17 00:00:00 2001
From: Coco Li <lixiaoyan@google.com>
Date: Wed, 5 Jan 2022 02:48:38 -0800
Subject: gro: add ability to control gro max packet size

Eric Dumazet suggested to allow users to modify max GRO packet size.

We have seen GRO being disabled by users of appliances (such as
wifi access points) because of claimed bufferbloat issues,
or some work arounds in sch_cake, to split GRO/GSO packets.

Instead of disabling GRO completely, one can chose to limit
the maximum packet size of GRO packets, depending on their
latency constraints.

This patch adds a per device gro_max_size attribute
that can be changed with ip link command.

ip link set dev eth0 gro_max_size 16000

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Coco Li <lixiaoyan@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 6f99c8f51b60..3213c7227b59 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1942,6 +1942,8 @@ enum netdev_ml_priv_type {
  *			dev->addr_list_lock.
  *	@unlink_list:	As netif_addr_lock() can be called recursively,
  *			keep a list of interfaces to be deleted.
+ *	@gro_max_size:	Maximum size of aggregated packet in generic
+ *			receive offload (GRO)
  *
  *	@dev_addr_shadow:	Copy of @dev_addr to catch direct writes.
  *	@linkwatch_dev_tracker:	refcount tracker used by linkwatch.
@@ -2131,6 +2133,8 @@ struct net_device {
 	struct bpf_prog __rcu	*xdp_prog;
 	unsigned long		gro_flush_timeout;
 	int			napi_defer_hard_irqs;
+#define GRO_MAX_SIZE		65536
+	unsigned int		gro_max_size;
 	rx_handler_func_t __rcu	*rx_handler;
 	void __rcu		*rx_handler_data;
 
@@ -4806,6 +4810,13 @@ static inline void netif_set_gso_max_segs(struct net_device *dev,
 	WRITE_ONCE(dev->gso_max_segs, segs);
 }
 
+static inline void netif_set_gro_max_size(struct net_device *dev,
+					  unsigned int size)
+{
+	/* This pairs with the READ_ONCE() in skb_gro_receive() */
+	WRITE_ONCE(dev->gro_max_size, size);
+}
+
 static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol,
 					int pulled_hlen, u16 mac_offset,
 					int mac_len)
-- 
cgit v1.2.3


From 3809fe479861194e310c23ed48b010c7c0f72d22 Mon Sep 17 00:00:00 2001
From: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Date: Thu, 16 Dec 2021 10:21:57 +0100
Subject: HID: address kernel-doc warnings

The command ./scripts/kernel-doc -none include/linux/hid.h reports:

  include/linux/hid.h:818: warning: cannot understand function prototype: 'struct hid_ll_driver '
  include/linux/hid.h:1135: warning: expecting prototype for hid_may_wakeup(). Prototype was for hid_hw_may_wakeup() instead

Address those kernel-doc warnings.

Signed-off-by: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 include/linux/hid.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hid.h b/include/linux/hid.h
index f453be385bd4..aaf89a8a836c 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -788,7 +788,7 @@ struct hid_driver {
 	container_of(pdrv, struct hid_driver, driver)
 
 /**
- * hid_ll_driver - low level driver callbacks
+ * struct hid_ll_driver - low level driver callbacks
  * @start: called on probe to start the device
  * @stop: called on remove
  * @open: called by input layer on open
@@ -1158,7 +1158,7 @@ static inline int hid_hw_idle(struct hid_device *hdev, int report, int idle,
 }
 
 /**
- * hid_may_wakeup - return if the hid device may act as a wakeup source during system-suspend
+ * hid_hw_may_wakeup - return if the hid device may act as a wakeup source during system-suspend
  *
  * @hdev: hid device
  */
-- 
cgit v1.2.3


From 36dacddbf0bdba86cd00f066b4d724157eeb63f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dirk=20M=C3=BCller?= <dmueller@suse.de>
Date: Wed, 5 Jan 2022 17:38:47 +0100
Subject: lib/raid6: Use strict priority ranking for pq gen() benchmarking
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On x86_64, currently 3 variants of AVX512, 3 variants of AVX2
and 3 variants of SSE2 are benchmarked on initialization, taking
between 144-153 jiffies. Testing across a hardware pool of
various generations of intel cpus I could not find a single
case where SSE2 won over AVX2 or AVX512. There are cases where
AVX2 wins over AVX512 however.

Change "prefer" into an integer priority field (similar to
how recov selection works) to have more than one ranking level
available, which is backwards compatible with existing behavior.

Give AVX2/512 variants higher priority over SSE2 in order to skip
SSE testing when AVX is available. in a AVX2/x86_64/HZ=250 case this
saves in the order of 200ms of initialization time.

Signed-off-by: Dirk Müller <dmueller@suse.de>
Acked-by: Paul Menzel <pmenzel@molgen.mpg.de>
Signed-off-by: Song Liu <song@kernel.org>
---
 include/linux/raid/pq.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index 154e954b711d..d6e5a1feb947 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -81,7 +81,7 @@ struct raid6_calls {
 	void (*xor_syndrome)(int, int, int, size_t, void **);
 	int  (*valid)(void);	/* Returns 1 if this routine set is usable */
 	const char *name;	/* Name of this routine set */
-	int prefer;		/* Has special performance attribute */
+	int priority;		/* Relative priority ranking if non-zero */
 };
 
 /* Selected algorithm */
-- 
cgit v1.2.3


From 07f910f9b7295b6a28b337fedb56e612684c5659 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 4 Oct 2021 14:46:50 +0100
Subject: mm: Remove slab from struct page

All members of struct slab can now be removed from struct page.
This shrinks the definition of struct page by 30 LOC, making
it easier to understand.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/mm_types.h   | 28 ----------------------------
 include/linux/page-flags.h | 37 -------------------------------------
 2 files changed, 65 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 1ae3537c7920..646f3ed4f6df 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -118,31 +118,6 @@ struct page {
 				atomic_long_t pp_frag_count;
 			};
 		};
-		struct {	/* slab, slob and slub */
-			union {
-				struct list_head slab_list;
-				struct {	/* Partial pages */
-					struct page *next;
-#ifdef CONFIG_64BIT
-					int pages;	/* Nr of pages left */
-#else
-					short int pages;
-#endif
-				};
-			};
-			struct kmem_cache *slab_cache; /* not slob */
-			/* Double-word boundary */
-			void *freelist;		/* first free object */
-			union {
-				void *s_mem;	/* slab: first object */
-				unsigned long counters;		/* SLUB */
-				struct {			/* SLUB */
-					unsigned inuse:16;
-					unsigned objects:15;
-					unsigned frozen:1;
-				};
-			};
-		};
 		struct {	/* Tail pages of compound page */
 			unsigned long compound_head;	/* Bit zero is set */
 
@@ -206,9 +181,6 @@ struct page {
 		 * which are currently stored here.
 		 */
 		unsigned int page_type;
-
-		unsigned int active;		/* SLAB */
-		int units;			/* SLOB */
 	};
 
 	/* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index b5f14d581113..1b08e33265fa 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -909,43 +909,6 @@ extern bool is_free_buddy_page(struct page *page);
 
 __PAGEFLAG(Isolated, isolated, PF_ANY);
 
-/*
- * If network-based swap is enabled, sl*b must keep track of whether pages
- * were allocated from pfmemalloc reserves.
- */
-static inline int PageSlabPfmemalloc(struct page *page)
-{
-	VM_BUG_ON_PAGE(!PageSlab(page), page);
-	return PageActive(page);
-}
-
-/*
- * A version of PageSlabPfmemalloc() for opportunistic checks where the page
- * might have been freed under us and not be a PageSlab anymore.
- */
-static inline int __PageSlabPfmemalloc(struct page *page)
-{
-	return PageActive(page);
-}
-
-static inline void SetPageSlabPfmemalloc(struct page *page)
-{
-	VM_BUG_ON_PAGE(!PageSlab(page), page);
-	SetPageActive(page);
-}
-
-static inline void __ClearPageSlabPfmemalloc(struct page *page)
-{
-	VM_BUG_ON_PAGE(!PageSlab(page), page);
-	__ClearPageActive(page);
-}
-
-static inline void ClearPageSlabPfmemalloc(struct page *page)
-{
-	VM_BUG_ON_PAGE(!PageSlab(page), page);
-	ClearPageActive(page);
-}
-
 #ifdef CONFIG_MMU
 #define __PG_MLOCKED		(1UL << PG_mlocked)
 #else
-- 
cgit v1.2.3


From b5e7b59c3480f355910f9d2c6ece5857922a5e54 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 28 Sep 2021 09:47:57 +1000
Subject: NFS: change nfs_access_get_cached to only report the mask

Currently the nfs_access_get_cached family of functions report a
'struct nfs_access_entry' as the result, with both .mask and .cred set.
However the .cred is never used.  This is probably good and there is no
guarantee that it won't be freed before use.

Change to only report the 'mask' - as this is all that is used or needed.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/nfs_fs.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 05f249f20f55..f33559acbcc2 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -533,8 +533,8 @@ extern int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fh,
 			struct nfs_fattr *fattr);
 extern int nfs_may_open(struct inode *inode, const struct cred *cred, int openflags);
 extern void nfs_access_zap_cache(struct inode *inode);
-extern int nfs_access_get_cached(struct inode *inode, const struct cred *cred, struct nfs_access_entry *res,
-				 bool may_block);
+extern int nfs_access_get_cached(struct inode *inode, const struct cred *cred,
+				 u32 *mask, bool may_block);
 
 /*
  * linux/fs/nfs/symlink.c
-- 
cgit v1.2.3


From 73fbb3fa647bdb5b60469af8101c741ece03a825 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 28 Sep 2021 09:47:57 +1000
Subject: NFS: pass cred explicitly for access tests

Storing the 'struct cred *' in nfs_access_entry is problematic.
An active 'cred' can keep a 'struct key *' active, and a quota is
imposed on the number of such keys that a user can maintain.
Cached 'nfs_access_entry' structs have indefinite lifetime, and having
these keep 'struct key's alive imposes on that quota.

So a future patch will remove the ->cred ref from nfs_access_entry.

To prepare, change various functions to not assume there is a 'cred' in
the nfs_access_entry, but to pass the cred around explicitly.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/nfs_fs.h  | 2 +-
 include/linux/nfs_xdr.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index f33559acbcc2..c33b1b792bc9 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -396,7 +396,7 @@ extern int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fa
 extern int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fattr *fattr);
 extern int nfs_getattr(struct user_namespace *, const struct path *,
 		       struct kstat *, u32, unsigned int);
-extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *);
+extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *, const struct cred *);
 extern void nfs_access_set_mask(struct nfs_access_entry *, u32);
 extern int nfs_permission(struct user_namespace *, struct inode *, int);
 extern int nfs_open(struct inode *, struct file *);
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 967a0098f0a9..9102bdaa3faa 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1737,7 +1737,7 @@ struct nfs_rpc_ops {
 			    struct nfs_fh *, struct nfs_fattr *);
 	int	(*lookupp) (struct inode *, struct nfs_fh *,
 			    struct nfs_fattr *);
-	int	(*access)  (struct inode *, struct nfs_access_entry *);
+	int	(*access)  (struct inode *, struct nfs_access_entry *, const struct cred *);
 	int	(*readlink)(struct inode *, struct page *, unsigned int,
 			    unsigned int);
 	int	(*create)  (struct inode *, struct dentry *,
-- 
cgit v1.2.3


From 6238aec83f3fb12132f964937e5bbcf248fea8f9 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 28 Sep 2021 09:47:57 +1000
Subject: NFS: don't store 'struct cred *' in struct nfs_access_entry

Storing the 'struct cred *' in nfs_access_entry is problematic.
An active 'cred' can keep a 'struct key *' active, and a quota is
imposed on the number of such keys that a user can maintain.
Cached 'nfs_access_entry' structs have indefinite lifetime, and having
these keep 'struct key's alive imposes on that quota.

So remove the 'struct cred *' and replace it with the fields we need:
  kuid_t, kgid_t, and struct group_info *

This makes the 'struct nfs_access_entry' 64 bits larger.

New function "access_cmp" is introduced which is identical to
cred_fscmp() except that the second arg is an 'nfs_access_entry', rather
than a 'cred'

Fixes: b68572e07c58 ("NFS: change access cache to use 'struct cred'.")
Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/nfs_fs.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index c33b1b792bc9..450e393d4bf2 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -61,7 +61,9 @@
 struct nfs_access_entry {
 	struct rb_node		rb_node;
 	struct list_head	lru;
-	const struct cred *	cred;
+	kuid_t			fsuid;
+	kgid_t			fsgid;
+	struct group_info	*group_info;
 	__u32			mask;
 	struct rcu_head		rcu_head;
 };
-- 
cgit v1.2.3


From 1ab5be4ac5b1c9ce39ce1037c45b68d2ce6eede0 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Fri, 17 Dec 2021 15:36:54 -0500
Subject: NFSv4: Add some support for case insensitive filesystems

Add capabilities to allow the NFS client to recognise when it is dealing
with case insensitive and case preserving filesystems.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/nfs_fs_sb.h | 2 ++
 include/linux/nfs_xdr.h   | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 2a9acbfe00f0..f24fc67af42d 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -271,6 +271,8 @@ struct nfs_server {
 #define NFS_CAP_ACLS		(1U << 3)
 #define NFS_CAP_ATOMIC_OPEN	(1U << 4)
 #define NFS_CAP_LGOPEN		(1U << 5)
+#define NFS_CAP_CASE_INSENSITIVE	(1U << 6)
+#define NFS_CAP_CASE_PRESERVING	(1U << 7)
 #define NFS_CAP_POSIX_LOCK	(1U << 14)
 #define NFS_CAP_UIDGID_NOMAP	(1U << 15)
 #define NFS_CAP_STATEID_NFSV41	(1U << 16)
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 9102bdaa3faa..ddff92396a3f 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1194,6 +1194,8 @@ struct nfs4_server_caps_res {
 	u32				has_links;
 	u32				has_symlinks;
 	u32				fh_expire_type;
+	u32				case_insensitive;
+	u32				case_preserving;
 };
 
 #define NFS4_PATHNAME_MAXCOMPONENTS 512
-- 
cgit v1.2.3


From 703f7066f40599c290babdb79dd61319264987e9 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 7 Dec 2021 13:17:33 +0100
Subject: random: remove unused irq_flags argument from
 add_interrupt_randomness()

Since commit
   ee3e00e9e7101 ("random: use registers from interrupted code for CPU's w/o a cycle counter")

the irq_flags argument is no longer used.

Remove unused irq_flags.

Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dexuan Cui <decui@microsoft.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: K. Y. Srinivasan <kys@microsoft.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wei Liu <wei.liu@kernel.org>
Cc: linux-hyperv@vger.kernel.org
Cc: x86@kernel.org
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Wei Liu <wei.liu@kernel.org>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 include/linux/random.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/random.h b/include/linux/random.h
index f45b8be3e3c4..c45b2693e51f 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -35,7 +35,7 @@ static inline void add_latent_entropy(void) {}
 
 extern void add_input_randomness(unsigned int type, unsigned int code,
 				 unsigned int value) __latent_entropy;
-extern void add_interrupt_randomness(int irq, int irq_flags) __latent_entropy;
+extern void add_interrupt_randomness(int irq) __latent_entropy;
 
 extern void get_random_bytes(void *buf, int nbytes);
 extern int wait_for_random_bytes(void);
-- 
cgit v1.2.3


From 79b60ca83b6fa63ef307d2edcc77ee6581da8971 Mon Sep 17 00:00:00 2001
From: Shay Drory <shayd@nvidia.com>
Date: Sun, 12 Dec 2021 14:51:27 +0200
Subject: net/mlx5: Introduce API for bulk request and release of IRQs

Currently IRQs are requested one by one. To balance spreading IRQs
among cpus using such scheme requires remembering cpu mask for the
cpus used for a given device. This complicates the IRQ allocation
scheme in subsequent patch.

Hence, prepare the code for bulk IRQs allocation. This enables
spreading IRQs among cpus in subsequent patch.

Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Parav Pandit <parav@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/eq.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/eq.h b/include/linux/mlx5/eq.h
index ea3ff5a8ced3..3705a382276b 100644
--- a/include/linux/mlx5/eq.h
+++ b/include/linux/mlx5/eq.h
@@ -9,13 +9,13 @@
 #define MLX5_NUM_SPARE_EQE (0x80)
 
 struct mlx5_eq;
+struct mlx5_irq;
 struct mlx5_core_dev;
 
 struct mlx5_eq_param {
-	u8             irq_index;
 	int            nent;
 	u64            mask[4];
-	cpumask_var_t  affinity;
+	struct mlx5_irq *irq;
 };
 
 struct mlx5_eq *
-- 
cgit v1.2.3


From 3663f26b389b3951426971b44bb9312fdff0efec Mon Sep 17 00:00:00 2001
From: Ajit Kumar Pandey <AjitKumar.Pandey@amd.com>
Date: Sun, 12 Dec 2021 23:35:24 +0530
Subject: drivers: acpi: acpi_apd: Remove unused device property "is-rv"

Initially "is-rv" device property is added for 48MHz fixed clock
support on Raven or RV architecture. It's unused now as we moved
to pci device_id based selection to extend such support on other
architectures. This change removed unused code from acpi driver.

Signed-off-by: Ajit Kumar Pandey <AjitKumar.Pandey@amd.com>
Reviewed-by: Mario Limonciello <Mario.Limonciello@amd.com>
Link: https://lore.kernel.org/r/20211212180527.1641362-3-AjitKumar.Pandey@amd.com
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/linux/platform_data/clk-fch.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/clk-fch.h b/include/linux/platform_data/clk-fch.h
index b9f682459f08..850ca776156d 100644
--- a/include/linux/platform_data/clk-fch.h
+++ b/include/linux/platform_data/clk-fch.h
@@ -12,7 +12,6 @@
 
 struct fch_clk_data {
 	void __iomem *base;
-	u32 is_rv;
 };
 
 #endif /* __CLK_FCH_H */
-- 
cgit v1.2.3


From 7fdb98e8a768b3ccc05494d3ea4436047f512b9d Mon Sep 17 00:00:00 2001
From: Ajit Kumar Pandey <AjitKumar.Pandey@amd.com>
Date: Sun, 12 Dec 2021 23:35:25 +0530
Subject: ACPI: APD: Add a fmw property clk-name

Add a new device property to fetch clk-name from firmware.

Signed-off-by: Ajit Kumar Pandey <AjitKumar.Pandey@amd.com>
Reviewed-by: Mario Limonciello <Mario.Limonciello@amd.com>
Link: https://lore.kernel.org/r/20211212180527.1641362-4-AjitKumar.Pandey@amd.com
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/linux/platform_data/clk-fch.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/clk-fch.h b/include/linux/platform_data/clk-fch.h
index 850ca776156d..11a2a23fd9b2 100644
--- a/include/linux/platform_data/clk-fch.h
+++ b/include/linux/platform_data/clk-fch.h
@@ -12,6 +12,7 @@
 
 struct fch_clk_data {
 	void __iomem *base;
+	char *name;
 };
 
 #endif /* __CLK_FCH_H */
-- 
cgit v1.2.3


From 2cee6fbb7f01bcb25f11ef1439e89a29de4c0c1d Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 25 Oct 2021 21:53:44 +0100
Subject: fscache: Remove the contents of the fscache driver, pending rewrite

Remove the code that comprises the fscache driver as it's going to be
substantially rewritten, with the majority of the code being erased in the
rewrite.

A small piece of linux/fscache.h is left as that is #included by a bunch of
network filesystems.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
Link: https://lore.kernel.org/r/163819578724.215744.18210619052245724238.stgit@warthog.procyon.org.uk/ # v1
Link: https://lore.kernel.org/r/163906884814.143852.6727245089843862889.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/163967077097.1823006.1377665951499979089.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/164021485548.640689.13876080567388696162.stgit@warthog.procyon.org.uk/ # v4
---
 include/linux/fscache-cache.h | 548 +--------------------------
 include/linux/fscache.h       | 851 +-----------------------------------------
 2 files changed, 3 insertions(+), 1396 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index 8d39491c5f9f..47f21a53ac4b 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
 /* General filesystem caching backing cache interface
  *
- * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * NOTE!!! See:
@@ -15,551 +15,5 @@
 #define _LINUX_FSCACHE_CACHE_H
 
 #include <linux/fscache.h>
-#include <linux/sched.h>
-#include <linux/workqueue.h>
-
-#define NR_MAXCACHES BITS_PER_LONG
-
-struct fscache_cache;
-struct fscache_cache_ops;
-struct fscache_object;
-struct fscache_operation;
-
-enum fscache_obj_ref_trace {
-	fscache_obj_get_add_to_deps,
-	fscache_obj_get_queue,
-	fscache_obj_put_alloc_fail,
-	fscache_obj_put_attach_fail,
-	fscache_obj_put_drop_obj,
-	fscache_obj_put_enq_dep,
-	fscache_obj_put_queue,
-	fscache_obj_put_work,
-	fscache_obj_ref__nr_traces
-};
-
-/*
- * cache tag definition
- */
-struct fscache_cache_tag {
-	struct list_head	link;
-	struct fscache_cache	*cache;		/* cache referred to by this tag */
-	unsigned long		flags;
-#define FSCACHE_TAG_RESERVED	0		/* T if tag is reserved for a cache */
-	atomic_t		usage;
-	char			name[];	/* tag name */
-};
-
-/*
- * cache definition
- */
-struct fscache_cache {
-	const struct fscache_cache_ops *ops;
-	struct fscache_cache_tag *tag;		/* tag representing this cache */
-	struct kobject		*kobj;		/* system representation of this cache */
-	struct list_head	link;		/* link in list of caches */
-	size_t			max_index_size;	/* maximum size of index data */
-	char			identifier[36];	/* cache label */
-
-	/* node management */
-	struct work_struct	op_gc;		/* operation garbage collector */
-	struct list_head	object_list;	/* list of data/index objects */
-	struct list_head	op_gc_list;	/* list of ops to be deleted */
-	spinlock_t		object_list_lock;
-	spinlock_t		op_gc_list_lock;
-	atomic_t		object_count;	/* no. of live objects in this cache */
-	struct fscache_object	*fsdef;		/* object for the fsdef index */
-	unsigned long		flags;
-#define FSCACHE_IOERROR		0	/* cache stopped on I/O error */
-#define FSCACHE_CACHE_WITHDRAWN	1	/* cache has been withdrawn */
-};
-
-extern wait_queue_head_t fscache_cache_cleared_wq;
-
-/*
- * operation to be applied to a cache object
- * - retrieval initiation operations are done in the context of the process
- *   that issued them, and not in an async thread pool
- */
-typedef void (*fscache_operation_release_t)(struct fscache_operation *op);
-typedef void (*fscache_operation_processor_t)(struct fscache_operation *op);
-typedef void (*fscache_operation_cancel_t)(struct fscache_operation *op);
-
-enum fscache_operation_state {
-	FSCACHE_OP_ST_BLANK,		/* Op is not yet submitted */
-	FSCACHE_OP_ST_INITIALISED,	/* Op is initialised */
-	FSCACHE_OP_ST_PENDING,		/* Op is blocked from running */
-	FSCACHE_OP_ST_IN_PROGRESS,	/* Op is in progress */
-	FSCACHE_OP_ST_COMPLETE,		/* Op is complete */
-	FSCACHE_OP_ST_CANCELLED,	/* Op has been cancelled */
-	FSCACHE_OP_ST_DEAD		/* Op is now dead */
-};
-
-struct fscache_operation {
-	struct work_struct	work;		/* record for async ops */
-	struct list_head	pend_link;	/* link in object->pending_ops */
-	struct fscache_object	*object;	/* object to be operated upon */
-
-	unsigned long		flags;
-#define FSCACHE_OP_TYPE		0x000f	/* operation type */
-#define FSCACHE_OP_ASYNC	0x0001	/* - async op, processor may sleep for disk */
-#define FSCACHE_OP_MYTHREAD	0x0002	/* - processing is done be issuing thread, not pool */
-#define FSCACHE_OP_WAITING	4	/* cleared when op is woken */
-#define FSCACHE_OP_EXCLUSIVE	5	/* exclusive op, other ops must wait */
-#define FSCACHE_OP_DEC_READ_CNT	6	/* decrement object->n_reads on destruction */
-#define FSCACHE_OP_UNUSE_COOKIE	7	/* call fscache_unuse_cookie() on completion */
-#define FSCACHE_OP_KEEP_FLAGS	0x00f0	/* flags to keep when repurposing an op */
-
-	enum fscache_operation_state state;
-	atomic_t		usage;
-	unsigned		debug_id;	/* debugging ID */
-
-	/* operation processor callback
-	 * - can be NULL if FSCACHE_OP_WAITING is going to be used to perform
-	 *   the op in a non-pool thread */
-	fscache_operation_processor_t processor;
-
-	/* Operation cancellation cleanup (optional) */
-	fscache_operation_cancel_t cancel;
-
-	/* operation releaser */
-	fscache_operation_release_t release;
-};
-
-extern atomic_t fscache_op_debug_id;
-extern void fscache_op_work_func(struct work_struct *work);
-
-extern void fscache_enqueue_operation(struct fscache_operation *);
-extern void fscache_op_complete(struct fscache_operation *, bool);
-extern void fscache_put_operation(struct fscache_operation *);
-extern void fscache_operation_init(struct fscache_cookie *,
-				   struct fscache_operation *,
-				   fscache_operation_processor_t,
-				   fscache_operation_cancel_t,
-				   fscache_operation_release_t);
-
-/*
- * data read operation
- */
-struct fscache_retrieval {
-	struct fscache_operation op;
-	struct fscache_cookie	*cookie;	/* The netfs cookie */
-	struct address_space	*mapping;	/* netfs pages */
-	fscache_rw_complete_t	end_io_func;	/* function to call on I/O completion */
-	void			*context;	/* netfs read context (pinned) */
-	struct list_head	to_do;		/* list of things to be done by the backend */
-	atomic_t		n_pages;	/* number of pages to be retrieved */
-};
-
-typedef int (*fscache_page_retrieval_func_t)(struct fscache_retrieval *op,
-					     struct page *page,
-					     gfp_t gfp);
-
-typedef int (*fscache_pages_retrieval_func_t)(struct fscache_retrieval *op,
-					      struct list_head *pages,
-					      unsigned *nr_pages,
-					      gfp_t gfp);
-
-/**
- * fscache_get_retrieval - Get an extra reference on a retrieval operation
- * @op: The retrieval operation to get a reference on
- *
- * Get an extra reference on a retrieval operation.
- */
-static inline
-struct fscache_retrieval *fscache_get_retrieval(struct fscache_retrieval *op)
-{
-	atomic_inc(&op->op.usage);
-	return op;
-}
-
-/**
- * fscache_enqueue_retrieval - Enqueue a retrieval operation for processing
- * @op: The retrieval operation affected
- *
- * Enqueue a retrieval operation for processing by the FS-Cache thread pool.
- */
-static inline void fscache_enqueue_retrieval(struct fscache_retrieval *op)
-{
-	fscache_enqueue_operation(&op->op);
-}
-
-/**
- * fscache_retrieval_complete - Record (partial) completion of a retrieval
- * @op: The retrieval operation affected
- * @n_pages: The number of pages to account for
- */
-static inline void fscache_retrieval_complete(struct fscache_retrieval *op,
-					      int n_pages)
-{
-	if (atomic_sub_return_relaxed(n_pages, &op->n_pages) <= 0)
-		fscache_op_complete(&op->op, false);
-}
-
-/**
- * fscache_put_retrieval - Drop a reference to a retrieval operation
- * @op: The retrieval operation affected
- *
- * Drop a reference to a retrieval operation.
- */
-static inline void fscache_put_retrieval(struct fscache_retrieval *op)
-{
-	fscache_put_operation(&op->op);
-}
-
-/*
- * cached page storage work item
- * - used to do three things:
- *   - batch writes to the cache
- *   - do cache writes asynchronously
- *   - defer writes until cache object lookup completion
- */
-struct fscache_storage {
-	struct fscache_operation op;
-	pgoff_t			store_limit;	/* don't write more than this */
-};
-
-/*
- * cache operations
- */
-struct fscache_cache_ops {
-	/* name of cache provider */
-	const char *name;
-
-	/* allocate an object record for a cookie */
-	struct fscache_object *(*alloc_object)(struct fscache_cache *cache,
-					       struct fscache_cookie *cookie);
-
-	/* look up the object for a cookie
-	 * - return -ETIMEDOUT to be requeued
-	 */
-	int (*lookup_object)(struct fscache_object *object);
-
-	/* finished looking up */
-	void (*lookup_complete)(struct fscache_object *object);
-
-	/* increment the usage count on this object (may fail if unmounting) */
-	struct fscache_object *(*grab_object)(struct fscache_object *object,
-					      enum fscache_obj_ref_trace why);
-
-	/* pin an object in the cache */
-	int (*pin_object)(struct fscache_object *object);
-
-	/* unpin an object in the cache */
-	void (*unpin_object)(struct fscache_object *object);
-
-	/* check the consistency between the backing cache and the FS-Cache
-	 * cookie */
-	int (*check_consistency)(struct fscache_operation *op);
-
-	/* store the updated auxiliary data on an object */
-	void (*update_object)(struct fscache_object *object);
-
-	/* Invalidate an object */
-	void (*invalidate_object)(struct fscache_operation *op);
-
-	/* discard the resources pinned by an object and effect retirement if
-	 * necessary */
-	void (*drop_object)(struct fscache_object *object);
-
-	/* dispose of a reference to an object */
-	void (*put_object)(struct fscache_object *object,
-			   enum fscache_obj_ref_trace why);
-
-	/* sync a cache */
-	void (*sync_cache)(struct fscache_cache *cache);
-
-	/* notification that the attributes of a non-index object (such as
-	 * i_size) have changed */
-	int (*attr_changed)(struct fscache_object *object);
-
-	/* reserve space for an object's data and associated metadata */
-	int (*reserve_space)(struct fscache_object *object, loff_t i_size);
-
-	/* request a backing block for a page be read or allocated in the
-	 * cache */
-	fscache_page_retrieval_func_t read_or_alloc_page;
-
-	/* request backing blocks for a list of pages be read or allocated in
-	 * the cache */
-	fscache_pages_retrieval_func_t read_or_alloc_pages;
-
-	/* request a backing block for a page be allocated in the cache so that
-	 * it can be written directly */
-	fscache_page_retrieval_func_t allocate_page;
-
-	/* request backing blocks for pages be allocated in the cache so that
-	 * they can be written directly */
-	fscache_pages_retrieval_func_t allocate_pages;
-
-	/* write a page to its backing block in the cache */
-	int (*write_page)(struct fscache_storage *op, struct page *page);
-
-	/* detach backing block from a page (optional)
-	 * - must release the cookie lock before returning
-	 * - may sleep
-	 */
-	void (*uncache_page)(struct fscache_object *object,
-			     struct page *page);
-
-	/* dissociate a cache from all the pages it was backing */
-	void (*dissociate_pages)(struct fscache_cache *cache);
-
-	/* Begin a read operation for the netfs lib */
-	int (*begin_read_operation)(struct netfs_read_request *rreq,
-				    struct fscache_retrieval *op);
-};
-
-extern struct fscache_cookie fscache_fsdef_index;
-
-/*
- * Event list for fscache_object::{event_mask,events}
- */
-enum {
-	FSCACHE_OBJECT_EV_NEW_CHILD,	/* T if object has a new child */
-	FSCACHE_OBJECT_EV_PARENT_READY,	/* T if object's parent is ready */
-	FSCACHE_OBJECT_EV_UPDATE,	/* T if object should be updated */
-	FSCACHE_OBJECT_EV_INVALIDATE,	/* T if cache requested object invalidation */
-	FSCACHE_OBJECT_EV_CLEARED,	/* T if accessors all gone */
-	FSCACHE_OBJECT_EV_ERROR,	/* T if fatal error occurred during processing */
-	FSCACHE_OBJECT_EV_KILL,		/* T if netfs relinquished or cache withdrew object */
-	NR_FSCACHE_OBJECT_EVENTS
-};
-
-#define FSCACHE_OBJECT_EVENTS_MASK ((1UL << NR_FSCACHE_OBJECT_EVENTS) - 1)
-
-/*
- * States for object state machine.
- */
-struct fscache_transition {
-	unsigned long events;
-	const struct fscache_state *transit_to;
-};
-
-struct fscache_state {
-	char name[24];
-	char short_name[8];
-	const struct fscache_state *(*work)(struct fscache_object *object,
-					    int event);
-	const struct fscache_transition transitions[];
-};
-
-/*
- * on-disk cache file or index handle
- */
-struct fscache_object {
-	const struct fscache_state *state;	/* Object state machine state */
-	const struct fscache_transition *oob_table; /* OOB state transition table */
-	int			debug_id;	/* debugging ID */
-	int			n_children;	/* number of child objects */
-	int			n_ops;		/* number of extant ops on object */
-	int			n_obj_ops;	/* number of object ops outstanding on object */
-	int			n_in_progress;	/* number of ops in progress */
-	int			n_exclusive;	/* number of exclusive ops queued or in progress */
-	atomic_t		n_reads;	/* number of read ops in progress */
-	spinlock_t		lock;		/* state and operations lock */
-
-	unsigned long		lookup_jif;	/* time at which lookup started */
-	unsigned long		oob_event_mask;	/* OOB events this object is interested in */
-	unsigned long		event_mask;	/* events this object is interested in */
-	unsigned long		events;		/* events to be processed by this object
-						 * (order is important - using fls) */
-
-	unsigned long		flags;
-#define FSCACHE_OBJECT_LOCK		0	/* T if object is busy being processed */
-#define FSCACHE_OBJECT_PENDING_WRITE	1	/* T if object has pending write */
-#define FSCACHE_OBJECT_WAITING		2	/* T if object is waiting on its parent */
-#define FSCACHE_OBJECT_IS_LIVE		3	/* T if object is not withdrawn or relinquished */
-#define FSCACHE_OBJECT_IS_LOOKED_UP	4	/* T if object has been looked up */
-#define FSCACHE_OBJECT_IS_AVAILABLE	5	/* T if object has become active */
-#define FSCACHE_OBJECT_RETIRED		6	/* T if object was retired on relinquishment */
-#define FSCACHE_OBJECT_KILLED_BY_CACHE	7	/* T if object was killed by the cache */
-#define FSCACHE_OBJECT_RUN_AFTER_DEAD	8	/* T if object has been dispatched after death */
-
-	struct list_head	cache_link;	/* link in cache->object_list */
-	struct hlist_node	cookie_link;	/* link in cookie->backing_objects */
-	struct fscache_cache	*cache;		/* cache that supplied this object */
-	struct fscache_cookie	*cookie;	/* netfs's file/index object */
-	struct fscache_object	*parent;	/* parent object */
-	struct work_struct	work;		/* attention scheduling record */
-	struct list_head	dependents;	/* FIFO of dependent objects */
-	struct list_head	dep_link;	/* link in parent's dependents list */
-	struct list_head	pending_ops;	/* unstarted operations on this object */
-	pgoff_t			store_limit;	/* current storage limit */
-	loff_t			store_limit_l;	/* current storage limit */
-};
-
-extern void fscache_object_init(struct fscache_object *, struct fscache_cookie *,
-				struct fscache_cache *);
-extern void fscache_object_destroy(struct fscache_object *);
-
-extern void fscache_object_lookup_negative(struct fscache_object *object);
-extern void fscache_obtained_object(struct fscache_object *object);
-
-static inline bool fscache_object_is_live(struct fscache_object *object)
-{
-	return test_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
-}
-
-static inline bool fscache_object_is_dying(struct fscache_object *object)
-{
-	return !fscache_object_is_live(object);
-}
-
-static inline bool fscache_object_is_available(struct fscache_object *object)
-{
-	return test_bit(FSCACHE_OBJECT_IS_AVAILABLE, &object->flags);
-}
-
-static inline bool fscache_cache_is_broken(struct fscache_object *object)
-{
-	return test_bit(FSCACHE_IOERROR, &object->cache->flags);
-}
-
-static inline bool fscache_object_is_active(struct fscache_object *object)
-{
-	return fscache_object_is_available(object) &&
-		fscache_object_is_live(object) &&
-		!fscache_cache_is_broken(object);
-}
-
-/**
- * fscache_object_destroyed - Note destruction of an object in a cache
- * @cache: The cache from which the object came
- *
- * Note the destruction and deallocation of an object record in a cache.
- */
-static inline void fscache_object_destroyed(struct fscache_cache *cache)
-{
-	if (atomic_dec_and_test(&cache->object_count))
-		wake_up_all(&fscache_cache_cleared_wq);
-}
-
-/**
- * fscache_object_lookup_error - Note an object encountered an error
- * @object: The object on which the error was encountered
- *
- * Note that an object encountered a fatal error (usually an I/O error) and
- * that it should be withdrawn as soon as possible.
- */
-static inline void fscache_object_lookup_error(struct fscache_object *object)
-{
-	set_bit(FSCACHE_OBJECT_EV_ERROR, &object->events);
-}
-
-/**
- * fscache_set_store_limit - Set the maximum size to be stored in an object
- * @object: The object to set the maximum on
- * @i_size: The limit to set in bytes
- *
- * Set the maximum size an object is permitted to reach, implying the highest
- * byte that may be written.  Intended to be called by the attr_changed() op.
- *
- * See Documentation/filesystems/caching/backend-api.rst for a complete
- * description.
- */
-static inline
-void fscache_set_store_limit(struct fscache_object *object, loff_t i_size)
-{
-	object->store_limit_l = i_size;
-	object->store_limit = i_size >> PAGE_SHIFT;
-	if (i_size & ~PAGE_MASK)
-		object->store_limit++;
-}
-
-/**
- * fscache_end_io - End a retrieval operation on a page
- * @op: The FS-Cache operation covering the retrieval
- * @page: The page that was to be fetched
- * @error: The error code (0 if successful)
- *
- * Note the end of an operation to retrieve a page, as covered by a particular
- * operation record.
- */
-static inline void fscache_end_io(struct fscache_retrieval *op,
-				  struct page *page, int error)
-{
-	op->end_io_func(page, op->context, error);
-}
-
-static inline void __fscache_use_cookie(struct fscache_cookie *cookie)
-{
-	atomic_inc(&cookie->n_active);
-}
-
-/**
- * fscache_use_cookie - Request usage of cookie attached to an object
- * @object: Object description
- * 
- * Request usage of the cookie attached to an object.  NULL is returned if the
- * relinquishment had reduced the cookie usage count to 0.
- */
-static inline bool fscache_use_cookie(struct fscache_object *object)
-{
-	struct fscache_cookie *cookie = object->cookie;
-	return atomic_inc_not_zero(&cookie->n_active) != 0;
-}
-
-static inline bool __fscache_unuse_cookie(struct fscache_cookie *cookie)
-{
-	return atomic_dec_and_test(&cookie->n_active);
-}
-
-static inline void __fscache_wake_unused_cookie(struct fscache_cookie *cookie)
-{
-	wake_up_var(&cookie->n_active);
-}
-
-/**
- * fscache_unuse_cookie - Cease usage of cookie attached to an object
- * @object: Object description
- * 
- * Cease usage of the cookie attached to an object.  When the users count
- * reaches zero then the cookie relinquishment will be permitted to proceed.
- */
-static inline void fscache_unuse_cookie(struct fscache_object *object)
-{
-	struct fscache_cookie *cookie = object->cookie;
-	if (__fscache_unuse_cookie(cookie))
-		__fscache_wake_unused_cookie(cookie);
-}
-
-/*
- * out-of-line cache backend functions
- */
-extern __printf(3, 4)
-void fscache_init_cache(struct fscache_cache *cache,
-			const struct fscache_cache_ops *ops,
-			const char *idfmt, ...);
-
-extern int fscache_add_cache(struct fscache_cache *cache,
-			     struct fscache_object *fsdef,
-			     const char *tagname);
-extern void fscache_withdraw_cache(struct fscache_cache *cache);
-
-extern void fscache_io_error(struct fscache_cache *cache);
-
-extern void fscache_mark_page_cached(struct fscache_retrieval *op,
-				     struct page *page);
-
-extern void fscache_mark_pages_cached(struct fscache_retrieval *op,
-				      struct pagevec *pagevec);
-
-extern bool fscache_object_sleep_till_congested(signed long *timeoutp);
-
-extern enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
-					       const void *data,
-					       uint16_t datalen,
-					       loff_t object_size);
-
-extern void fscache_object_retrying_stale(struct fscache_object *object);
-
-enum fscache_why_object_killed {
-	FSCACHE_OBJECT_IS_STALE,
-	FSCACHE_OBJECT_NO_SPACE,
-	FSCACHE_OBJECT_WAS_RETIRED,
-	FSCACHE_OBJECT_WAS_CULLED,
-};
-extern void fscache_object_mark_killed(struct fscache_object *object,
-				       enum fscache_why_object_killed why);
 
 #endif /* _LINUX_FSCACHE_CACHE_H */
diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index 3b2282c157f7..0364a4ca16f6 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -15,861 +15,14 @@
 #define _LINUX_FSCACHE_H
 
 #include <linux/fs.h>
-#include <linux/list.h>
-#include <linux/pagemap.h>
-#include <linux/pagevec.h>
-#include <linux/list_bl.h>
 #include <linux/netfs.h>
 
 #if defined(CONFIG_FSCACHE) || defined(CONFIG_FSCACHE_MODULE)
-#define fscache_available() (1)
 #define fscache_cookie_valid(cookie) (cookie)
+#define fscache_cookie_enabled(cookie) (cookie)
 #else
-#define fscache_available() (0)
 #define fscache_cookie_valid(cookie) (0)
+#define fscache_cookie_enabled(cookie) (0)
 #endif
 
-
-/* pattern used to fill dead space in an index entry */
-#define FSCACHE_INDEX_DEADFILL_PATTERN 0x79
-
-struct pagevec;
-struct fscache_cache_tag;
-struct fscache_cookie;
-struct fscache_netfs;
-struct netfs_read_request;
-
-typedef void (*fscache_rw_complete_t)(struct page *page,
-				      void *context,
-				      int error);
-
-/* result of index entry consultation */
-enum fscache_checkaux {
-	FSCACHE_CHECKAUX_OKAY,		/* entry okay as is */
-	FSCACHE_CHECKAUX_NEEDS_UPDATE,	/* entry requires update */
-	FSCACHE_CHECKAUX_OBSOLETE,	/* entry requires deletion */
-};
-
-/*
- * fscache cookie definition
- */
-struct fscache_cookie_def {
-	/* name of cookie type */
-	char name[16];
-
-	/* cookie type */
-	uint8_t type;
-#define FSCACHE_COOKIE_TYPE_INDEX	0
-#define FSCACHE_COOKIE_TYPE_DATAFILE	1
-
-	/* select the cache into which to insert an entry in this index
-	 * - optional
-	 * - should return a cache identifier or NULL to cause the cache to be
-	 *   inherited from the parent if possible or the first cache picked
-	 *   for a non-index file if not
-	 */
-	struct fscache_cache_tag *(*select_cache)(
-		const void *parent_netfs_data,
-		const void *cookie_netfs_data);
-
-	/* consult the netfs about the state of an object
-	 * - this function can be absent if the index carries no state data
-	 * - the netfs data from the cookie being used as the target is
-	 *   presented, as is the auxiliary data and the object size
-	 */
-	enum fscache_checkaux (*check_aux)(void *cookie_netfs_data,
-					   const void *data,
-					   uint16_t datalen,
-					   loff_t object_size);
-
-	/* get an extra reference on a read context
-	 * - this function can be absent if the completion function doesn't
-	 *   require a context
-	 */
-	void (*get_context)(void *cookie_netfs_data, void *context);
-
-	/* release an extra reference on a read context
-	 * - this function can be absent if the completion function doesn't
-	 *   require a context
-	 */
-	void (*put_context)(void *cookie_netfs_data, void *context);
-
-	/* indicate page that now have cache metadata retained
-	 * - this function should mark the specified page as now being cached
-	 * - the page will have been marked with PG_fscache before this is
-	 *   called, so this is optional
-	 */
-	void (*mark_page_cached)(void *cookie_netfs_data,
-				 struct address_space *mapping,
-				 struct page *page);
-};
-
-/*
- * fscache cached network filesystem type
- * - name, version and ops must be filled in before registration
- * - all other fields will be set during registration
- */
-struct fscache_netfs {
-	uint32_t			version;	/* indexing version */
-	const char			*name;		/* filesystem name */
-	struct fscache_cookie		*primary_index;
-};
-
-/*
- * data file or index object cookie
- * - a file will only appear in one cache
- * - a request to cache a file may or may not be honoured, subject to
- *   constraints such as disk space
- * - indices are created on disk just-in-time
- */
-struct fscache_cookie {
-	refcount_t			ref;		/* number of users of this cookie */
-	atomic_t			n_children;	/* number of children of this cookie */
-	atomic_t			n_active;	/* number of active users of netfs ptrs */
-	unsigned int			debug_id;
-	spinlock_t			lock;
-	spinlock_t			stores_lock;	/* lock on page store tree */
-	struct hlist_head		backing_objects; /* object(s) backing this file/index */
-	const struct fscache_cookie_def	*def;		/* definition */
-	struct fscache_cookie		*parent;	/* parent of this entry */
-	struct hlist_bl_node		hash_link;	/* Link in hash table */
-	struct list_head		proc_link;	/* Link in proc list */
-	void				*netfs_data;	/* back pointer to netfs */
-	struct radix_tree_root		stores;		/* pages to be stored on this cookie */
-#define FSCACHE_COOKIE_PENDING_TAG	0		/* pages tag: pending write to cache */
-#define FSCACHE_COOKIE_STORING_TAG	1		/* pages tag: writing to cache */
-
-	unsigned long			flags;
-#define FSCACHE_COOKIE_LOOKING_UP	0	/* T if non-index cookie being looked up still */
-#define FSCACHE_COOKIE_NO_DATA_YET	1	/* T if new object with no cached data yet */
-#define FSCACHE_COOKIE_UNAVAILABLE	2	/* T if cookie is unavailable (error, etc) */
-#define FSCACHE_COOKIE_INVALIDATING	3	/* T if cookie is being invalidated */
-#define FSCACHE_COOKIE_RELINQUISHED	4	/* T if cookie has been relinquished */
-#define FSCACHE_COOKIE_ENABLED		5	/* T if cookie is enabled */
-#define FSCACHE_COOKIE_ENABLEMENT_LOCK	6	/* T if cookie is being en/disabled */
-#define FSCACHE_COOKIE_AUX_UPDATED	8	/* T if the auxiliary data was updated */
-#define FSCACHE_COOKIE_ACQUIRED		9	/* T if cookie is in use */
-#define FSCACHE_COOKIE_RELINQUISHING	10	/* T if cookie is being relinquished */
-
-	u8				type;		/* Type of object */
-	u8				key_len;	/* Length of index key */
-	u8				aux_len;	/* Length of auxiliary data */
-	u32				key_hash;	/* Hash of parent, type, key, len */
-	union {
-		void			*key;		/* Index key */
-		u8			inline_key[16];	/* - If the key is short enough */
-	};
-	union {
-		void			*aux;		/* Auxiliary data */
-		u8			inline_aux[8];	/* - If the aux data is short enough */
-	};
-};
-
-static inline bool fscache_cookie_enabled(struct fscache_cookie *cookie)
-{
-	return fscache_cookie_valid(cookie) && test_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags);
-}
-
-/*
- * slow-path functions for when there is actually caching available, and the
- * netfs does actually have a valid token
- * - these are not to be called directly
- * - these are undefined symbols when FS-Cache is not configured and the
- *   optimiser takes care of not using them
- */
-extern int __fscache_register_netfs(struct fscache_netfs *);
-extern void __fscache_unregister_netfs(struct fscache_netfs *);
-extern struct fscache_cache_tag *__fscache_lookup_cache_tag(const char *);
-extern void __fscache_release_cache_tag(struct fscache_cache_tag *);
-
-extern struct fscache_cookie *__fscache_acquire_cookie(
-	struct fscache_cookie *,
-	const struct fscache_cookie_def *,
-	const void *, size_t,
-	const void *, size_t,
-	void *, loff_t, bool);
-extern void __fscache_relinquish_cookie(struct fscache_cookie *, const void *, bool);
-extern int __fscache_check_consistency(struct fscache_cookie *, const void *);
-extern void __fscache_update_cookie(struct fscache_cookie *, const void *);
-extern int __fscache_attr_changed(struct fscache_cookie *);
-extern void __fscache_invalidate(struct fscache_cookie *);
-extern void __fscache_wait_on_invalidate(struct fscache_cookie *);
-
-#ifdef FSCACHE_USE_NEW_IO_API
-extern int __fscache_begin_read_operation(struct netfs_read_request *, struct fscache_cookie *);
-#else
-extern int __fscache_read_or_alloc_page(struct fscache_cookie *,
-					struct page *,
-					fscache_rw_complete_t,
-					void *,
-					gfp_t);
-extern int __fscache_read_or_alloc_pages(struct fscache_cookie *,
-					 struct address_space *,
-					 struct list_head *,
-					 unsigned *,
-					 fscache_rw_complete_t,
-					 void *,
-					 gfp_t);
-extern int __fscache_alloc_page(struct fscache_cookie *, struct page *, gfp_t);
-extern int __fscache_write_page(struct fscache_cookie *, struct page *, loff_t, gfp_t);
-extern void __fscache_uncache_page(struct fscache_cookie *, struct page *);
-extern bool __fscache_check_page_write(struct fscache_cookie *, struct page *);
-extern void __fscache_wait_on_page_write(struct fscache_cookie *, struct page *);
-extern bool __fscache_maybe_release_page(struct fscache_cookie *, struct page *,
-					 gfp_t);
-extern void __fscache_uncache_all_inode_pages(struct fscache_cookie *,
-					      struct inode *);
-extern void __fscache_readpages_cancel(struct fscache_cookie *cookie,
-				       struct list_head *pages);
-#endif /* FSCACHE_USE_NEW_IO_API */
-
-extern void __fscache_disable_cookie(struct fscache_cookie *, const void *, bool);
-extern void __fscache_enable_cookie(struct fscache_cookie *, const void *, loff_t,
-				    bool (*)(void *), void *);
-
-/**
- * fscache_register_netfs - Register a filesystem as desiring caching services
- * @netfs: The description of the filesystem
- *
- * Register a filesystem as desiring caching services if they're available.
- *
- * See Documentation/filesystems/caching/netfs-api.rst for a complete
- * description.
- */
-static inline
-int fscache_register_netfs(struct fscache_netfs *netfs)
-{
-	if (fscache_available())
-		return __fscache_register_netfs(netfs);
-	else
-		return 0;
-}
-
-/**
- * fscache_unregister_netfs - Indicate that a filesystem no longer desires
- * caching services
- * @netfs: The description of the filesystem
- *
- * Indicate that a filesystem no longer desires caching services for the
- * moment.
- *
- * See Documentation/filesystems/caching/netfs-api.rst for a complete
- * description.
- */
-static inline
-void fscache_unregister_netfs(struct fscache_netfs *netfs)
-{
-	if (fscache_available())
-		__fscache_unregister_netfs(netfs);
-}
-
-/**
- * fscache_lookup_cache_tag - Look up a cache tag
- * @name: The name of the tag to search for
- *
- * Acquire a specific cache referral tag that can be used to select a specific
- * cache in which to cache an index.
- *
- * See Documentation/filesystems/caching/netfs-api.rst for a complete
- * description.
- */
-static inline
-struct fscache_cache_tag *fscache_lookup_cache_tag(const char *name)
-{
-	if (fscache_available())
-		return __fscache_lookup_cache_tag(name);
-	else
-		return NULL;
-}
-
-/**
- * fscache_release_cache_tag - Release a cache tag
- * @tag: The tag to release
- *
- * Release a reference to a cache referral tag previously looked up.
- *
- * See Documentation/filesystems/caching/netfs-api.rst for a complete
- * description.
- */
-static inline
-void fscache_release_cache_tag(struct fscache_cache_tag *tag)
-{
-	if (fscache_available())
-		__fscache_release_cache_tag(tag);
-}
-
-/**
- * fscache_acquire_cookie - Acquire a cookie to represent a cache object
- * @parent: The cookie that's to be the parent of this one
- * @def: A description of the cache object, including callback operations
- * @index_key: The index key for this cookie
- * @index_key_len: Size of the index key
- * @aux_data: The auxiliary data for the cookie (may be NULL)
- * @aux_data_len: Size of the auxiliary data buffer
- * @netfs_data: An arbitrary piece of data to be kept in the cookie to
- * represent the cache object to the netfs
- * @object_size: The initial size of object
- * @enable: Whether or not to enable a data cookie immediately
- *
- * This function is used to inform FS-Cache about part of an index hierarchy
- * that can be used to locate files.  This is done by requesting a cookie for
- * each index in the path to the file.
- *
- * See Documentation/filesystems/caching/netfs-api.rst for a complete
- * description.
- */
-static inline
-struct fscache_cookie *fscache_acquire_cookie(
-	struct fscache_cookie *parent,
-	const struct fscache_cookie_def *def,
-	const void *index_key,
-	size_t index_key_len,
-	const void *aux_data,
-	size_t aux_data_len,
-	void *netfs_data,
-	loff_t object_size,
-	bool enable)
-{
-	if (fscache_cookie_valid(parent) && fscache_cookie_enabled(parent))
-		return __fscache_acquire_cookie(parent, def,
-						index_key, index_key_len,
-						aux_data, aux_data_len,
-						netfs_data, object_size, enable);
-	else
-		return NULL;
-}
-
-/**
- * fscache_relinquish_cookie - Return the cookie to the cache, maybe discarding
- * it
- * @cookie: The cookie being returned
- * @aux_data: The updated auxiliary data for the cookie (may be NULL)
- * @retire: True if the cache object the cookie represents is to be discarded
- *
- * This function returns a cookie to the cache, forcibly discarding the
- * associated cache object if retire is set to true.  The opportunity is
- * provided to update the auxiliary data in the cache before the object is
- * disconnected.
- *
- * See Documentation/filesystems/caching/netfs-api.rst for a complete
- * description.
- */
-static inline
-void fscache_relinquish_cookie(struct fscache_cookie *cookie,
-			       const void *aux_data,
-			       bool retire)
-{
-	if (fscache_cookie_valid(cookie))
-		__fscache_relinquish_cookie(cookie, aux_data, retire);
-}
-
-/**
- * fscache_check_consistency - Request validation of a cache's auxiliary data
- * @cookie: The cookie representing the cache object
- * @aux_data: The updated auxiliary data for the cookie (may be NULL)
- *
- * Request an consistency check from fscache, which passes the request to the
- * backing cache.  The auxiliary data on the cookie will be updated first if
- * @aux_data is set.
- *
- * Returns 0 if consistent and -ESTALE if inconsistent.  May also
- * return -ENOMEM and -ERESTARTSYS.
- */
-static inline
-int fscache_check_consistency(struct fscache_cookie *cookie,
-			      const void *aux_data)
-{
-	if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie))
-		return __fscache_check_consistency(cookie, aux_data);
-	else
-		return 0;
-}
-
-/**
- * fscache_update_cookie - Request that a cache object be updated
- * @cookie: The cookie representing the cache object
- * @aux_data: The updated auxiliary data for the cookie (may be NULL)
- *
- * Request an update of the index data for the cache object associated with the
- * cookie.  The auxiliary data on the cookie will be updated first if @aux_data
- * is set.
- *
- * See Documentation/filesystems/caching/netfs-api.rst for a complete
- * description.
- */
-static inline
-void fscache_update_cookie(struct fscache_cookie *cookie, const void *aux_data)
-{
-	if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie))
-		__fscache_update_cookie(cookie, aux_data);
-}
-
-/**
- * fscache_pin_cookie - Pin a data-storage cache object in its cache
- * @cookie: The cookie representing the cache object
- *
- * Permit data-storage cache objects to be pinned in the cache.
- *
- * See Documentation/filesystems/caching/netfs-api.rst for a complete
- * description.
- */
-static inline
-int fscache_pin_cookie(struct fscache_cookie *cookie)
-{
-	return -ENOBUFS;
-}
-
-/**
- * fscache_pin_cookie - Unpin a data-storage cache object in its cache
- * @cookie: The cookie representing the cache object
- *
- * Permit data-storage cache objects to be unpinned from the cache.
- *
- * See Documentation/filesystems/caching/netfs-api.rst for a complete
- * description.
- */
-static inline
-void fscache_unpin_cookie(struct fscache_cookie *cookie)
-{
-}
-
-/**
- * fscache_attr_changed - Notify cache that an object's attributes changed
- * @cookie: The cookie representing the cache object
- *
- * Send a notification to the cache indicating that an object's attributes have
- * changed.  This includes the data size.  These attributes will be obtained
- * through the get_attr() cookie definition op.
- *
- * See Documentation/filesystems/caching/netfs-api.rst for a complete
- * description.
- */
-static inline
-int fscache_attr_changed(struct fscache_cookie *cookie)
-{
-	if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie))
-		return __fscache_attr_changed(cookie);
-	else
-		return -ENOBUFS;
-}
-
-/**
- * fscache_invalidate - Notify cache that an object needs invalidation
- * @cookie: The cookie representing the cache object
- *
- * Notify the cache that an object is needs to be invalidated and that it
- * should abort any retrievals or stores it is doing on the cache.  The object
- * is then marked non-caching until such time as the invalidation is complete.
- *
- * This can be called with spinlocks held.
- *
- * See Documentation/filesystems/caching/netfs-api.rst for a complete
- * description.
- */
-static inline
-void fscache_invalidate(struct fscache_cookie *cookie)
-{
-	if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie))
-		__fscache_invalidate(cookie);
-}
-
-/**
- * fscache_wait_on_invalidate - Wait for invalidation to complete
- * @cookie: The cookie representing the cache object
- *
- * Wait for the invalidation of an object to complete.
- *
- * See Documentation/filesystems/caching/netfs-api.rst for a complete
- * description.
- */
-static inline
-void fscache_wait_on_invalidate(struct fscache_cookie *cookie)
-{
-	if (fscache_cookie_valid(cookie))
-		__fscache_wait_on_invalidate(cookie);
-}
-
-/**
- * fscache_reserve_space - Reserve data space for a cached object
- * @cookie: The cookie representing the cache object
- * @i_size: The amount of space to be reserved
- *
- * Reserve an amount of space in the cache for the cache object attached to a
- * cookie so that a write to that object within the space can always be
- * honoured.
- *
- * See Documentation/filesystems/caching/netfs-api.rst for a complete
- * description.
- */
-static inline
-int fscache_reserve_space(struct fscache_cookie *cookie, loff_t size)
-{
-	return -ENOBUFS;
-}
-
-#ifdef FSCACHE_USE_NEW_IO_API
-
-/**
- * fscache_begin_read_operation - Begin a read operation for the netfs lib
- * @rreq: The read request being undertaken
- * @cookie: The cookie representing the cache object
- *
- * Begin a read operation on behalf of the netfs helper library.  @rreq
- * indicates the read request to which the operation state should be attached;
- * @cookie indicates the cache object that will be accessed.
- *
- * This is intended to be called from the ->begin_cache_operation() netfs lib
- * operation as implemented by the network filesystem.
- *
- * Returns:
- * * 0		- Success
- * * -ENOBUFS	- No caching available
- * * Other error code from the cache, such as -ENOMEM.
- */
-static inline
-int fscache_begin_read_operation(struct netfs_read_request *rreq,
-				 struct fscache_cookie *cookie)
-{
-	if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie))
-		return __fscache_begin_read_operation(rreq, cookie);
-	return -ENOBUFS;
-}
-
-#else /* FSCACHE_USE_NEW_IO_API */
-
-/**
- * fscache_read_or_alloc_page - Read a page from the cache or allocate a block
- * in which to store it
- * @cookie: The cookie representing the cache object
- * @page: The netfs page to fill if possible
- * @end_io_func: The callback to invoke when and if the page is filled
- * @context: An arbitrary piece of data to pass on to end_io_func()
- * @gfp: The conditions under which memory allocation should be made
- *
- * Read a page from the cache, or if that's not possible make a potential
- * one-block reservation in the cache into which the page may be stored once
- * fetched from the server.
- *
- * If the page is not backed by the cache object, or if it there's some reason
- * it can't be, -ENOBUFS will be returned and nothing more will be done for
- * that page.
- *
- * Else, if that page is backed by the cache, a read will be initiated directly
- * to the netfs's page and 0 will be returned by this function.  The
- * end_io_func() callback will be invoked when the operation terminates on a
- * completion or failure.  Note that the callback may be invoked before the
- * return.
- *
- * Else, if the page is unbacked, -ENODATA is returned and a block may have
- * been allocated in the cache.
- *
- * See Documentation/filesystems/caching/netfs-api.rst for a complete
- * description.
- */
-static inline
-int fscache_read_or_alloc_page(struct fscache_cookie *cookie,
-			       struct page *page,
-			       fscache_rw_complete_t end_io_func,
-			       void *context,
-			       gfp_t gfp)
-{
-	if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie))
-		return __fscache_read_or_alloc_page(cookie, page, end_io_func,
-						    context, gfp);
-	else
-		return -ENOBUFS;
-}
-
-/**
- * fscache_read_or_alloc_pages - Read pages from the cache and/or allocate
- * blocks in which to store them
- * @cookie: The cookie representing the cache object
- * @mapping: The netfs inode mapping to which the pages will be attached
- * @pages: A list of potential netfs pages to be filled
- * @nr_pages: Number of pages to be read and/or allocated
- * @end_io_func: The callback to invoke when and if each page is filled
- * @context: An arbitrary piece of data to pass on to end_io_func()
- * @gfp: The conditions under which memory allocation should be made
- *
- * Read a set of pages from the cache, or if that's not possible, attempt to
- * make a potential one-block reservation for each page in the cache into which
- * that page may be stored once fetched from the server.
- *
- * If some pages are not backed by the cache object, or if it there's some
- * reason they can't be, -ENOBUFS will be returned and nothing more will be
- * done for that pages.
- *
- * Else, if some of the pages are backed by the cache, a read will be initiated
- * directly to the netfs's page and 0 will be returned by this function.  The
- * end_io_func() callback will be invoked when the operation terminates on a
- * completion or failure.  Note that the callback may be invoked before the
- * return.
- *
- * Else, if a page is unbacked, -ENODATA is returned and a block may have
- * been allocated in the cache.
- *
- * Because the function may want to return all of -ENOBUFS, -ENODATA and 0 in
- * regard to different pages, the return values are prioritised in that order.
- * Any pages submitted for reading are removed from the pages list.
- *
- * See Documentation/filesystems/caching/netfs-api.rst for a complete
- * description.
- */
-static inline
-int fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
-				struct address_space *mapping,
-				struct list_head *pages,
-				unsigned *nr_pages,
-				fscache_rw_complete_t end_io_func,
-				void *context,
-				gfp_t gfp)
-{
-	if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie))
-		return __fscache_read_or_alloc_pages(cookie, mapping, pages,
-						     nr_pages, end_io_func,
-						     context, gfp);
-	else
-		return -ENOBUFS;
-}
-
-/**
- * fscache_alloc_page - Allocate a block in which to store a page
- * @cookie: The cookie representing the cache object
- * @page: The netfs page to allocate a page for
- * @gfp: The conditions under which memory allocation should be made
- *
- * Request Allocation a block in the cache in which to store a netfs page
- * without retrieving any contents from the cache.
- *
- * If the page is not backed by a file then -ENOBUFS will be returned and
- * nothing more will be done, and no reservation will be made.
- *
- * Else, a block will be allocated if one wasn't already, and 0 will be
- * returned
- *
- * See Documentation/filesystems/caching/netfs-api.rst for a complete
- * description.
- */
-static inline
-int fscache_alloc_page(struct fscache_cookie *cookie,
-		       struct page *page,
-		       gfp_t gfp)
-{
-	if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie))
-		return __fscache_alloc_page(cookie, page, gfp);
-	else
-		return -ENOBUFS;
-}
-
-/**
- * fscache_readpages_cancel - Cancel read/alloc on pages
- * @cookie: The cookie representing the inode's cache object.
- * @pages: The netfs pages that we canceled write on in readpages()
- *
- * Uncache/unreserve the pages reserved earlier in readpages() via
- * fscache_readpages_or_alloc() and similar.  In most successful caches in
- * readpages() this doesn't do anything.  In cases when the underlying netfs's
- * readahead failed we need to clean up the pagelist (unmark and uncache).
- *
- * This function may sleep as it may have to clean up disk state.
- */
-static inline
-void fscache_readpages_cancel(struct fscache_cookie *cookie,
-			      struct list_head *pages)
-{
-	if (fscache_cookie_valid(cookie))
-		__fscache_readpages_cancel(cookie, pages);
-}
-
-/**
- * fscache_write_page - Request storage of a page in the cache
- * @cookie: The cookie representing the cache object
- * @page: The netfs page to store
- * @object_size: Updated size of object
- * @gfp: The conditions under which memory allocation should be made
- *
- * Request the contents of the netfs page be written into the cache.  This
- * request may be ignored if no cache block is currently allocated, in which
- * case it will return -ENOBUFS.
- *
- * If a cache block was already allocated, a write will be initiated and 0 will
- * be returned.  The PG_fscache_write page bit is set immediately and will then
- * be cleared at the completion of the write to indicate the success or failure
- * of the operation.  Note that the completion may happen before the return.
- *
- * See Documentation/filesystems/caching/netfs-api.rst for a complete
- * description.
- */
-static inline
-int fscache_write_page(struct fscache_cookie *cookie,
-		       struct page *page,
-		       loff_t object_size,
-		       gfp_t gfp)
-{
-	if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie))
-		return __fscache_write_page(cookie, page, object_size, gfp);
-	else
-		return -ENOBUFS;
-}
-
-/**
- * fscache_uncache_page - Indicate that caching is no longer required on a page
- * @cookie: The cookie representing the cache object
- * @page: The netfs page that was being cached.
- *
- * Tell the cache that we no longer want a page to be cached and that it should
- * remove any knowledge of the netfs page it may have.
- *
- * Note that this cannot cancel any outstanding I/O operations between this
- * page and the cache.
- *
- * See Documentation/filesystems/caching/netfs-api.rst for a complete
- * description.
- */
-static inline
-void fscache_uncache_page(struct fscache_cookie *cookie,
-			  struct page *page)
-{
-	if (fscache_cookie_valid(cookie))
-		__fscache_uncache_page(cookie, page);
-}
-
-/**
- * fscache_check_page_write - Ask if a page is being writing to the cache
- * @cookie: The cookie representing the cache object
- * @page: The netfs page that is being cached.
- *
- * Ask the cache if a page is being written to the cache.
- *
- * See Documentation/filesystems/caching/netfs-api.rst for a complete
- * description.
- */
-static inline
-bool fscache_check_page_write(struct fscache_cookie *cookie,
-			      struct page *page)
-{
-	if (fscache_cookie_valid(cookie))
-		return __fscache_check_page_write(cookie, page);
-	return false;
-}
-
-/**
- * fscache_wait_on_page_write - Wait for a page to complete writing to the cache
- * @cookie: The cookie representing the cache object
- * @page: The netfs page that is being cached.
- *
- * Ask the cache to wake us up when a page is no longer being written to the
- * cache.
- *
- * See Documentation/filesystems/caching/netfs-api.rst for a complete
- * description.
- */
-static inline
-void fscache_wait_on_page_write(struct fscache_cookie *cookie,
-				struct page *page)
-{
-	if (fscache_cookie_valid(cookie))
-		__fscache_wait_on_page_write(cookie, page);
-}
-
-/**
- * fscache_maybe_release_page - Consider releasing a page, cancelling a store
- * @cookie: The cookie representing the cache object
- * @page: The netfs page that is being cached.
- * @gfp: The gfp flags passed to releasepage()
- *
- * Consider releasing a page for the vmscan algorithm, on behalf of the netfs's
- * releasepage() call.  A storage request on the page may cancelled if it is
- * not currently being processed.
- *
- * The function returns true if the page no longer has a storage request on it,
- * and false if a storage request is left in place.  If true is returned, the
- * page will have been passed to fscache_uncache_page().  If false is returned
- * the page cannot be freed yet.
- */
-static inline
-bool fscache_maybe_release_page(struct fscache_cookie *cookie,
-				struct page *page,
-				gfp_t gfp)
-{
-	if (fscache_cookie_valid(cookie) && PageFsCache(page))
-		return __fscache_maybe_release_page(cookie, page, gfp);
-	return true;
-}
-
-/**
- * fscache_uncache_all_inode_pages - Uncache all an inode's pages
- * @cookie: The cookie representing the inode's cache object.
- * @inode: The inode to uncache pages from.
- *
- * Uncache all the pages in an inode that are marked PG_fscache, assuming them
- * to be associated with the given cookie.
- *
- * This function may sleep.  It will wait for pages that are being written out
- * and will wait whilst the PG_fscache mark is removed by the cache.
- */
-static inline
-void fscache_uncache_all_inode_pages(struct fscache_cookie *cookie,
-				     struct inode *inode)
-{
-	if (fscache_cookie_valid(cookie))
-		__fscache_uncache_all_inode_pages(cookie, inode);
-}
-
-#endif /* FSCACHE_USE_NEW_IO_API */
-
-/**
- * fscache_disable_cookie - Disable a cookie
- * @cookie: The cookie representing the cache object
- * @aux_data: The updated auxiliary data for the cookie (may be NULL)
- * @invalidate: Invalidate the backing object
- *
- * Disable a cookie from accepting further alloc, read, write, invalidate,
- * update or acquire operations.  Outstanding operations can still be waited
- * upon and pages can still be uncached and the cookie relinquished.
- *
- * This will not return until all outstanding operations have completed.
- *
- * If @invalidate is set, then the backing object will be invalidated and
- * detached, otherwise it will just be detached.
- *
- * If @aux_data is set, then auxiliary data will be updated from that.
- */
-static inline
-void fscache_disable_cookie(struct fscache_cookie *cookie,
-			    const void *aux_data,
-			    bool invalidate)
-{
-	if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie))
-		__fscache_disable_cookie(cookie, aux_data, invalidate);
-}
-
-/**
- * fscache_enable_cookie - Reenable a cookie
- * @cookie: The cookie representing the cache object
- * @aux_data: The updated auxiliary data for the cookie (may be NULL)
- * @object_size: Current size of object
- * @can_enable: A function to permit enablement once lock is held
- * @data: Data for can_enable()
- *
- * Reenable a previously disabled cookie, allowing it to accept further alloc,
- * read, write, invalidate, update or acquire operations.  An attempt will be
- * made to immediately reattach the cookie to a backing object.  If @aux_data
- * is set, the auxiliary data attached to the cookie will be updated.
- *
- * The can_enable() function is called (if not NULL) once the enablement lock
- * is held to rule on whether enablement is still permitted to go ahead.
- */
-static inline
-void fscache_enable_cookie(struct fscache_cookie *cookie,
-			   const void *aux_data,
-			   loff_t object_size,
-			   bool (*can_enable)(void *data),
-			   void *data)
-{
-	if (fscache_cookie_valid(cookie) && !fscache_cookie_enabled(cookie))
-		__fscache_enable_cookie(cookie, aux_data, object_size,
-					can_enable, data);
-}
-
 #endif /* _LINUX_FSCACHE_H */
-- 
cgit v1.2.3


From a39c41b853ee51f4dcd19f5556f860ae8e2f23d3 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 20 Oct 2021 14:30:37 +0100
Subject: netfs: Pass a flag to ->prepare_write() to say if there's no alloc'd
 space

Pass a flag to ->prepare_write() to indicate if there's definitely no
space allocated in the cache yet (for instance if we've already checked as
we were asked to do a read).

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
Link: https://lore.kernel.org/r/163819583123.215744.12783808230464471417.stgit@warthog.procyon.org.uk/ # v1
Link: https://lore.kernel.org/r/163906886835.143852.6689886781122679769.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/163967079100.1823006.12889542712309574359.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/164021489334.640689.3131206613015409076.stgit@warthog.procyon.org.uk/ # v4
---
 include/linux/netfs.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index ca0683b9e3d1..1ea22fc48818 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -232,7 +232,8 @@ struct netfs_cache_ops {
 	 * actually do.
 	 */
 	int (*prepare_write)(struct netfs_cache_resources *cres,
-			     loff_t *_start, size_t *_len, loff_t i_size);
+			     loff_t *_start, size_t *_len, loff_t i_size,
+			     bool no_space_allocated_yet);
 };
 
 struct readahead_control;
-- 
cgit v1.2.3


From 1e1236b841166f1d2daf36fdf6bb3e656bc5f5ca Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 20 Oct 2021 14:34:41 +0100
Subject: fscache: Introduce new driver

Introduce basic skeleton of the new, rewritten fscache driver.

Changes
=======
ver #3:
 - Use remove_proc_subtree(), not remove_proc_entry() to remove a populated
   dir.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
Link: https://lore.kernel.org/r/163819584034.215744.4290533472390439030.stgit@warthog.procyon.org.uk/ # v1
Link: https://lore.kernel.org/r/163906887770.143852.3577888294989185666.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/163967080039.1823006.5702921801104057922.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/164021491014.640689.4292699878317589512.stgit@warthog.procyon.org.uk/ # v4
---
 include/linux/fscache-cache.h | 2 ++
 include/linux/fscache.h       | 6 +++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index 47f21a53ac4b..d6910a913918 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -16,4 +16,6 @@
 
 #include <linux/fscache.h>
 
+extern struct workqueue_struct *fscache_wq;
+
 #endif /* _LINUX_FSCACHE_CACHE_H */
diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index 0364a4ca16f6..1cf90c252aac 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
 /* General filesystem caching interface
  *
- * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * NOTE!!! See:
@@ -18,9 +18,13 @@
 #include <linux/netfs.h>
 
 #if defined(CONFIG_FSCACHE) || defined(CONFIG_FSCACHE_MODULE)
+#define __fscache_available (1)
+#define fscache_available() (1)
 #define fscache_cookie_valid(cookie) (cookie)
 #define fscache_cookie_enabled(cookie) (cookie)
 #else
+#define __fscache_available (0)
+#define fscache_available() (0)
 #define fscache_cookie_valid(cookie) (0)
 #define fscache_cookie_enabled(cookie) (0)
 #endif
-- 
cgit v1.2.3


From 9549332df4ed4e761a1d41c83f2c25d28bb22431 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 20 Oct 2021 15:00:26 +0100
Subject: fscache: Implement cache registration

Implement a register of caches and provide functions to manage it.

Two functions are provided for the cache backend to use:

 (1) Acquire a cache cookie:

	struct fscache_cache *fscache_acquire_cache(const char *name)

     This gets the cache cookie for a cache of the specified name and moves
     it to the preparation state.  If a nameless cache cookie exists, that
     will be given this name and used.

 (2) Relinquish a cache cookie:

	void fscache_relinquish_cache(struct fscache_cache *cache);

     This relinquishes a cache cookie, cleans it and makes it available if
     it's still referenced by a network filesystem.

Note that network filesystems don't deal with cache cookies directly, but
rather go straight to the volume registration.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
Link: https://lore.kernel.org/r/163819587157.215744.13523139317322503286.stgit@warthog.procyon.org.uk/ # v1
Link: https://lore.kernel.org/r/163906889665.143852.10378009165231294456.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/163967085081.1823006.2218944206363626210.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/164021494847.640689.10109692261640524343.stgit@warthog.procyon.org.uk/ # v4
---
 include/linux/fscache-cache.h | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index d6910a913918..18cd5c9877bb 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -16,6 +16,40 @@
 
 #include <linux/fscache.h>
 
+enum fscache_cache_trace;
+enum fscache_access_trace;
+
+enum fscache_cache_state {
+	FSCACHE_CACHE_IS_NOT_PRESENT,	/* No cache is present for this name */
+	FSCACHE_CACHE_IS_PREPARING,	/* A cache is preparing to come live */
+	FSCACHE_CACHE_IS_ACTIVE,	/* Attached cache is active and can be used */
+	FSCACHE_CACHE_GOT_IOERROR,	/* Attached cache stopped on I/O error */
+	FSCACHE_CACHE_IS_WITHDRAWN,	/* Attached cache is being withdrawn */
+#define NR__FSCACHE_CACHE_STATE (FSCACHE_CACHE_IS_WITHDRAWN + 1)
+};
+
+/*
+ * Cache cookie.
+ */
+struct fscache_cache {
+	struct list_head	cache_link;	/* Link in cache list */
+	void			*cache_priv;	/* Private cache data (or NULL) */
+	refcount_t		ref;
+	atomic_t		n_volumes;	/* Number of active volumes; */
+	atomic_t		n_accesses;	/* Number of in-progress accesses on the cache */
+	atomic_t		object_count;	/* no. of live objects in this cache */
+	unsigned int		debug_id;
+	enum fscache_cache_state state;
+	char			*name;
+};
+
 extern struct workqueue_struct *fscache_wq;
 
+/*
+ * out-of-line cache backend functions
+ */
+extern struct rw_semaphore fscache_addremove_sem;
+extern struct fscache_cache *fscache_acquire_cache(const char *name);
+extern void fscache_relinquish_cache(struct fscache_cache *cache);
+
 #endif /* _LINUX_FSCACHE_CACHE_H */
-- 
cgit v1.2.3


From 62ab63352350e881ae693a8236b35d7d0516c78b Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 20 Oct 2021 15:26:17 +0100
Subject: fscache: Implement volume registration

Add functions to the fscache API to allow volumes to be acquired and
relinquished by the network filesystem.  A volume is an index of data
storage cache objects.  A volume is represented by a volume cookie in the
API.  A filesystem would typically create a volume for a superblock and
then create per-inode cookies within it.

To request a volume, the filesystem calls:

	struct fscache_volume *
	fscache_acquire_volume(const char *volume_key,
			       const char *cache_name,
			       const void *coherency_data,
			       size_t coherency_len)

The volume_key is a printable string used to match the volume in the cache.
It should not contain any '/' characters.  For AFS, for example, this would
be "afs,<cellname>,<volume_id>", e.g. "afs,example.com,523001".

The cache_name can be NULL, but if not it should be a string indicating the
name of the cache to use if there's more than one available.

The coherency data, if given, is an arbitrarily-sized blob that's attached
to the volume and is compared when the volume is looked up.  If it doesn't
match, the old volume is judged to be out of date and it and everything
within it is discarded.

Acquiring a volume twice concurrently is disallowed, though the function
will wait if an old volume cookie is being relinquishing.


When a network filesystem has finished with a volume, it should return the
volume cookie by calling:

	void
	fscache_relinquish_volume(struct fscache_volume *volume,
				  const void *coherency_data,
				  bool invalidate)

If invalidate is true, the entire volume will be discarded; if false, the
volume will be synced and the coherency data will be updated.

Changes
=======
ver #4:
 - Removed an extraneous param from kdoc on fscache_relinquish_volume()[3].

ver #3:
 - fscache_hash()'s size parameter is now in bytes.  Use __le32 as the unit
   to round up to.
 - When comparing cookies, simply see if the attributes are the same rather
   than subtracting them to produce a strcmp-style return[2].
 - Make the coherency data an arbitrary blob rather than a u64, but don't
   store it for the moment.

ver #2:
 - Fix error check[1].
 - Make a fscache_acquire_volume() return errors, including EBUSY if a
   conflicting volume cookie already exists.  No error is printed now -
   that's left to the netfs.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
Link: https://lore.kernel.org/r/20211203095608.GC2480@kili/ [1]
Link: https://lore.kernel.org/r/CAHk-=whtkzB446+hX0zdLsdcUJsJ=8_-0S1mE_R+YurThfUbLA@mail.gmail.com/ [2]
Link: https://lore.kernel.org/r/20211220224646.30e8205c@canb.auug.org.au/ [3]
Link: https://lore.kernel.org/r/163819588944.215744.1629085755564865996.stgit@warthog.procyon.org.uk/ # v1
Link: https://lore.kernel.org/r/163906890630.143852.13972180614535611154.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/163967086836.1823006.8191672796841981763.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/164021495816.640689.4403156093668590217.stgit@warthog.procyon.org.uk/ # v4
---
 include/linux/fscache.h | 84 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index 1cf90c252aac..131a741a6652 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -20,13 +20,97 @@
 #if defined(CONFIG_FSCACHE) || defined(CONFIG_FSCACHE_MODULE)
 #define __fscache_available (1)
 #define fscache_available() (1)
+#define fscache_volume_valid(volume) (volume)
 #define fscache_cookie_valid(cookie) (cookie)
 #define fscache_cookie_enabled(cookie) (cookie)
 #else
 #define __fscache_available (0)
 #define fscache_available() (0)
+#define fscache_volume_valid(volume) (0)
 #define fscache_cookie_valid(cookie) (0)
 #define fscache_cookie_enabled(cookie) (0)
 #endif
 
+/*
+ * Volume representation cookie.
+ */
+struct fscache_volume {
+	refcount_t			ref;
+	atomic_t			n_cookies;	/* Number of data cookies in volume */
+	atomic_t			n_accesses;	/* Number of cache accesses in progress */
+	unsigned int			debug_id;
+	unsigned int			key_hash;	/* Hash of key string */
+	char				*key;		/* Volume ID, eg. "afs@example.com@1234" */
+	struct list_head		proc_link;	/* Link in /proc/fs/fscache/volumes */
+	struct hlist_bl_node		hash_link;	/* Link in hash table */
+	struct work_struct		work;
+	struct fscache_cache		*cache;		/* The cache in which this resides */
+	void				*cache_priv;	/* Cache private data */
+	spinlock_t			lock;
+	unsigned long			flags;
+#define FSCACHE_VOLUME_RELINQUISHED	0	/* Volume is being cleaned up */
+#define FSCACHE_VOLUME_INVALIDATE	1	/* Volume was invalidated */
+#define FSCACHE_VOLUME_COLLIDED_WITH	2	/* Volume was collided with */
+#define FSCACHE_VOLUME_ACQUIRE_PENDING	3	/* Volume is waiting to complete acquisition */
+#define FSCACHE_VOLUME_CREATING		4	/* Volume is being created on disk */
+};
+
+/*
+ * slow-path functions for when there is actually caching available, and the
+ * netfs does actually have a valid token
+ * - these are not to be called directly
+ * - these are undefined symbols when FS-Cache is not configured and the
+ *   optimiser takes care of not using them
+ */
+extern struct fscache_volume *__fscache_acquire_volume(const char *, const char *,
+						       const void *, size_t);
+extern void __fscache_relinquish_volume(struct fscache_volume *, const void *, bool);
+
+/**
+ * fscache_acquire_volume - Register a volume as desiring caching services
+ * @volume_key: An identification string for the volume
+ * @cache_name: The name of the cache to use (or NULL for the default)
+ * @coherency_data: Piece of arbitrary coherency data to check (or NULL)
+ * @coherency_len: The size of the coherency data
+ *
+ * Register a volume as desiring caching services if they're available.  The
+ * caller must provide an identifier for the volume and may also indicate which
+ * cache it should be in.  If a preexisting volume entry is found in the cache,
+ * the coherency data must match otherwise the entry will be invalidated.
+ *
+ * Returns a cookie pointer on success, -ENOMEM if out of memory or -EBUSY if a
+ * cache volume of that name is already acquired.  Note that "NULL" is a valid
+ * cookie pointer and can be returned if caching is refused.
+ */
+static inline
+struct fscache_volume *fscache_acquire_volume(const char *volume_key,
+					      const char *cache_name,
+					      const void *coherency_data,
+					      size_t coherency_len)
+{
+	if (!fscache_available())
+		return NULL;
+	return __fscache_acquire_volume(volume_key, cache_name,
+					coherency_data, coherency_len);
+}
+
+/**
+ * fscache_relinquish_volume - Cease caching a volume
+ * @volume: The volume cookie
+ * @coherency_data: Piece of arbitrary coherency data to set (or NULL)
+ * @invalidate: True if the volume should be invalidated
+ *
+ * Indicate that a filesystem no longer desires caching services for a volume.
+ * The caller must have relinquished all file cookies prior to calling this.
+ * The stored coherency data is updated.
+ */
+static inline
+void fscache_relinquish_volume(struct fscache_volume *volume,
+			       const void *coherency_data,
+			       bool invalidate)
+{
+	if (fscache_volume_valid(volume))
+		__fscache_relinquish_volume(volume, coherency_data, invalidate);
+}
+
 #endif /* _LINUX_FSCACHE_H */
-- 
cgit v1.2.3


From 7f3283aba39a0f395700c3b5defa4ec49d9914b3 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 20 Oct 2021 15:53:34 +0100
Subject: fscache: Implement cookie registration

Add functions to the fscache API to allow data file cookies to be acquired
and relinquished by the network filesystem.  It is intended that the
filesystem will create such cookies per-inode under a volume.

To request a cookie, the filesystem should call:

	struct fscache_cookie *
	fscache_acquire_cookie(struct fscache_volume *volume,
			       u8 advice,
			       const void *index_key,
			       size_t index_key_len,
			       const void *aux_data,
			       size_t aux_data_len,
			       loff_t object_size)


The filesystem must first have created a volume cookie, which is passed in
here.  If it passes in NULL then the function will just return a NULL
cookie.

A binary key should be passed in index_key and is of size index_key_len.
This is saved in the cookie and is used to locate the associated data in
the cache.

A coherency data buffer of size aux_data_len will be allocated and
initialised from the buffer pointed to by aux_data.  This is used to
validate cache objects when they're opened and is stored on disk with them
when they're committed.  The data is stored in the cookie and will be
updateable by various functions in later patches.

The object_size must also be given.  This is also used to perform a
coherency check and to size the backing storage appropriately.

This function disallows a cookie from being acquired twice in parallel,
though it will cause the second user to wait if the first is busy
relinquishing its cookie.


When a network filesystem has finished with a cookie, it should call:

	void
	fscache_relinquish_cookie(struct fscache_volume *volume,
				  bool retire)

If retire is true, any backing data will be discarded immediately.

Changes
=======
ver #3:
 - fscache_hash()'s size parameter is now in bytes.  Use __le32 as the unit
   to round up to.
 - When comparing cookies, simply see if the attributes are the same rather
   than subtracting them to produce a strcmp-style return[1].
 - Add a check to see if the cookie is still hashed at the point of
   freeing.

ver #2:
 - Don't hold n_accesses elevated whilst cache is bound to a cookie, but
   rather add a flag that prevents the state machine from being queued when
   n_accesses reaches 0.
 - Remove the unused cookie pointer field from the fscache_acquire
   tracepoint.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
Link: https://lore.kernel.org/r/CAHk-=whtkzB446+hX0zdLsdcUJsJ=8_-0S1mE_R+YurThfUbLA@mail.gmail.com/ [1]
Link: https://lore.kernel.org/r/163819590658.215744.14934902514281054323.stgit@warthog.procyon.org.uk/ # v1
Link: https://lore.kernel.org/r/163906891983.143852.6219772337558577395.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/163967088507.1823006.12659006350221417165.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/164021498432.640689.12743483856927722772.stgit@warthog.procyon.org.uk/ # v4
---
 include/linux/fscache-cache.h |  22 +++++++
 include/linux/fscache.h       | 134 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 156 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index 18cd5c9877bb..c4355b888c91 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -17,6 +17,7 @@
 #include <linux/fscache.h>
 
 enum fscache_cache_trace;
+enum fscache_cookie_trace;
 enum fscache_access_trace;
 
 enum fscache_cache_state {
@@ -52,4 +53,25 @@ extern struct rw_semaphore fscache_addremove_sem;
 extern struct fscache_cache *fscache_acquire_cache(const char *name);
 extern void fscache_relinquish_cache(struct fscache_cache *cache);
 
+extern struct fscache_cookie *fscache_get_cookie(struct fscache_cookie *cookie,
+						 enum fscache_cookie_trace where);
+extern void fscache_put_cookie(struct fscache_cookie *cookie,
+			       enum fscache_cookie_trace where);
+extern void fscache_set_cookie_state(struct fscache_cookie *cookie,
+				     enum fscache_cookie_state state);
+
+/**
+ * fscache_get_key - Get a pointer to the cookie key
+ * @cookie: The cookie to query
+ *
+ * Return a pointer to the where a cookie's key is stored.
+ */
+static inline void *fscache_get_key(struct fscache_cookie *cookie)
+{
+	if (cookie->key_len <= sizeof(cookie->inline_key))
+		return cookie->inline_key;
+	else
+		return cookie->key;
+}
+
 #endif /* _LINUX_FSCACHE_CACHE_H */
diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index 131a741a6652..4450d17c11e8 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -31,6 +31,27 @@
 #define fscache_cookie_enabled(cookie) (0)
 #endif
 
+struct fscache_cookie;
+
+#define FSCACHE_ADV_SINGLE_CHUNK	0x01 /* The object is a single chunk of data */
+#define FSCACHE_ADV_WRITE_CACHE		0x00 /* Do cache if written to locally */
+#define FSCACHE_ADV_WRITE_NOCACHE	0x02 /* Don't cache if written to locally */
+
+/*
+ * Data object state.
+ */
+enum fscache_cookie_state {
+	FSCACHE_COOKIE_STATE_QUIESCENT,		/* The cookie is uncached */
+	FSCACHE_COOKIE_STATE_LOOKING_UP,	/* The cache object is being looked up */
+	FSCACHE_COOKIE_STATE_CREATING,		/* The cache object is being created */
+	FSCACHE_COOKIE_STATE_ACTIVE,		/* The cache is active, readable and writable */
+	FSCACHE_COOKIE_STATE_FAILED,		/* The cache failed, withdraw to clear */
+	FSCACHE_COOKIE_STATE_WITHDRAWING,	/* The cookie is being withdrawn */
+	FSCACHE_COOKIE_STATE_RELINQUISHING,	/* The cookie is being relinquished */
+	FSCACHE_COOKIE_STATE_DROPPED,		/* The cookie has been dropped */
+#define FSCACHE_COOKIE_STATE__NR (FSCACHE_COOKIE_STATE_DROPPED + 1)
+} __attribute__((mode(byte)));
+
 /*
  * Volume representation cookie.
  */
@@ -55,6 +76,60 @@ struct fscache_volume {
 #define FSCACHE_VOLUME_CREATING		4	/* Volume is being created on disk */
 };
 
+/*
+ * Data file representation cookie.
+ * - a file will only appear in one cache
+ * - a request to cache a file may or may not be honoured, subject to
+ *   constraints such as disk space
+ * - indices are created on disk just-in-time
+ */
+struct fscache_cookie {
+	refcount_t			ref;
+	atomic_t			n_active;	/* number of active users of cookie */
+	atomic_t			n_accesses;	/* Number of cache accesses in progress */
+	unsigned int			debug_id;
+	unsigned int			inval_counter;	/* Number of invalidations made */
+	spinlock_t			lock;
+	struct fscache_volume		*volume;	/* Parent volume of this file. */
+	void				*cache_priv;	/* Cache-side representation */
+	struct hlist_bl_node		hash_link;	/* Link in hash table */
+	struct list_head		proc_link;	/* Link in proc list */
+	struct list_head		commit_link;	/* Link in commit queue */
+	struct work_struct		work;		/* Commit/relinq/withdraw work */
+	loff_t				object_size;	/* Size of the netfs object */
+	unsigned long			unused_at;	/* Time at which unused (jiffies) */
+	unsigned long			flags;
+#define FSCACHE_COOKIE_RELINQUISHED	0		/* T if cookie has been relinquished */
+#define FSCACHE_COOKIE_RETIRED		1		/* T if this cookie has retired on relinq */
+#define FSCACHE_COOKIE_IS_CACHING	2		/* T if this cookie is cached */
+#define FSCACHE_COOKIE_NO_DATA_TO_READ	3		/* T if this cookie has nothing to read */
+#define FSCACHE_COOKIE_NEEDS_UPDATE	4		/* T if attrs have been updated */
+#define FSCACHE_COOKIE_HAS_BEEN_CACHED	5		/* T if cookie needs withdraw-on-relinq */
+#define FSCACHE_COOKIE_DISABLED		6		/* T if cookie has been disabled */
+#define FSCACHE_COOKIE_LOCAL_WRITE	7		/* T if cookie has been modified locally */
+#define FSCACHE_COOKIE_NO_ACCESS_WAKE	8		/* T if no wake when n_accesses goes 0 */
+#define FSCACHE_COOKIE_DO_RELINQUISH	9		/* T if this cookie needs relinquishment */
+#define FSCACHE_COOKIE_DO_WITHDRAW	10		/* T if this cookie needs withdrawing */
+#define FSCACHE_COOKIE_DO_LRU_DISCARD	11		/* T if this cookie needs LRU discard */
+#define FSCACHE_COOKIE_DO_PREP_TO_WRITE	12		/* T if cookie needs write preparation */
+#define FSCACHE_COOKIE_HAVE_DATA	13		/* T if this cookie has data stored */
+#define FSCACHE_COOKIE_IS_HASHED	14		/* T if this cookie is hashed */
+
+	enum fscache_cookie_state	state;
+	u8				advice;		/* FSCACHE_ADV_* */
+	u8				key_len;	/* Length of index key */
+	u8				aux_len;	/* Length of auxiliary data */
+	u32				key_hash;	/* Hash of volume, key, len */
+	union {
+		void			*key;		/* Index key */
+		u8			inline_key[16];	/* - If the key is short enough */
+	};
+	union {
+		void			*aux;		/* Auxiliary data */
+		u8			inline_aux[8];	/* - If the aux data is short enough */
+	};
+};
+
 /*
  * slow-path functions for when there is actually caching available, and the
  * netfs does actually have a valid token
@@ -66,6 +141,14 @@ extern struct fscache_volume *__fscache_acquire_volume(const char *, const char
 						       const void *, size_t);
 extern void __fscache_relinquish_volume(struct fscache_volume *, const void *, bool);
 
+extern struct fscache_cookie *__fscache_acquire_cookie(
+	struct fscache_volume *,
+	u8,
+	const void *, size_t,
+	const void *, size_t,
+	loff_t);
+extern void __fscache_relinquish_cookie(struct fscache_cookie *, bool);
+
 /**
  * fscache_acquire_volume - Register a volume as desiring caching services
  * @volume_key: An identification string for the volume
@@ -113,4 +196,55 @@ void fscache_relinquish_volume(struct fscache_volume *volume,
 		__fscache_relinquish_volume(volume, coherency_data, invalidate);
 }
 
+/**
+ * fscache_acquire_cookie - Acquire a cookie to represent a cache object
+ * @volume: The volume in which to locate/create this cookie
+ * @advice: Advice flags (FSCACHE_COOKIE_ADV_*)
+ * @index_key: The index key for this cookie
+ * @index_key_len: Size of the index key
+ * @aux_data: The auxiliary data for the cookie (may be NULL)
+ * @aux_data_len: Size of the auxiliary data buffer
+ * @object_size: The initial size of object
+ *
+ * Acquire a cookie to represent a data file within the given cache volume.
+ *
+ * See Documentation/filesystems/caching/netfs-api.rst for a complete
+ * description.
+ */
+static inline
+struct fscache_cookie *fscache_acquire_cookie(struct fscache_volume *volume,
+					      u8 advice,
+					      const void *index_key,
+					      size_t index_key_len,
+					      const void *aux_data,
+					      size_t aux_data_len,
+					      loff_t object_size)
+{
+	if (!fscache_volume_valid(volume))
+		return NULL;
+	return __fscache_acquire_cookie(volume, advice,
+					index_key, index_key_len,
+					aux_data, aux_data_len,
+					object_size);
+}
+
+/**
+ * fscache_relinquish_cookie - Return the cookie to the cache, maybe discarding
+ * it
+ * @cookie: The cookie being returned
+ * @retire: True if the cache object the cookie represents is to be discarded
+ *
+ * This function returns a cookie to the cache, forcibly discarding the
+ * associated cache object if retire is set to true.
+ *
+ * See Documentation/filesystems/caching/netfs-api.rst for a complete
+ * description.
+ */
+static inline
+void fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire)
+{
+	if (fscache_cookie_valid(cookie))
+		__fscache_relinquish_cookie(cookie, retire);
+}
+
 #endif /* _LINUX_FSCACHE_H */
-- 
cgit v1.2.3


From e6acd3299badbfb5fb0231d42481d4f5dedf5599 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 20 Oct 2021 15:26:17 +0100
Subject: fscache: Implement volume-level access helpers

Add a pair of helper functions to manage access to a volume, pinning the
volume in place for the duration to prevent cache withdrawal from removing
it:

	bool fscache_begin_volume_access(struct fscache_volume *volume,
					 enum fscache_access_trace why);
	void fscache_end_volume_access(struct fscache_volume *volume,
				       enum fscache_access_trace why);

The way the access gate on the volume works/will work is:

  (1) If the cache tests as not live (state is not FSCACHE_CACHE_IS_ACTIVE),
      then we return false to indicate access was not permitted.

  (2) If the cache tests as live, then we increment the volume's n_accesses
      count and then recheck the cache liveness, ending the access if it
      ceased to be live.

  (3) When we end the access, we decrement the volume's n_accesses and wake
      up the any waiters if it reaches 0.

  (4) Whilst the cache is caching, the volume's n_accesses is kept
      artificially incremented to prevent wakeups from happening.

  (5) When the cache is taken offline, the state is changed to prevent new
      accesses, the volume's n_accesses is decremented and we wait for it to
      become 0.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
Link: https://lore.kernel.org/r/163819594158.215744.8285859817391683254.stgit@warthog.procyon.org.uk/ # v1
Link: https://lore.kernel.org/r/163906894315.143852.5454793807544710479.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/163967095028.1823006.9173132503876627466.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/164021501546.640689.9631510472149608443.stgit@warthog.procyon.org.uk/ # v4
---
 include/linux/fscache-cache.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index c4355b888c91..fbbd8a2afe12 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -53,6 +53,10 @@ extern struct rw_semaphore fscache_addremove_sem;
 extern struct fscache_cache *fscache_acquire_cache(const char *name);
 extern void fscache_relinquish_cache(struct fscache_cache *cache);
 
+extern void fscache_end_volume_access(struct fscache_volume *volume,
+				      struct fscache_cookie *cookie,
+				      enum fscache_access_trace why);
+
 extern struct fscache_cookie *fscache_get_cookie(struct fscache_cookie *cookie,
 						 enum fscache_cookie_trace where);
 extern void fscache_put_cookie(struct fscache_cookie *cookie,
-- 
cgit v1.2.3


From a7733fb632722a2f085f9324f14783effe268ed3 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 20 Oct 2021 15:53:34 +0100
Subject: fscache: Implement cookie-level access helpers

Add a number of helper functions to manage access to a cookie, pinning the
cache object in place for the duration to prevent cache withdrawal from
removing it:

 (1) void fscache_init_access_gate(struct fscache_cookie *cookie);

     This function initialises the access count when a cache binds to a
     cookie.  An extra ref is taken on the access count to prevent wakeups
     while the cache is active.  We're only interested in the wakeup when a
     cookie is being withdrawn and we're waiting for it to quiesce - at
     which point the counter will be decremented before the wait.

     The FSCACHE_COOKIE_NACC_ELEVATED flag is set on the cookie to keep
     track of the extra ref in order to handle a race between
     relinquishment and withdrawal both trying to drop the extra ref.

 (2) bool fscache_begin_cookie_access(struct fscache_cookie *cookie,
				      enum fscache_access_trace why);

     This function attempts to begin access upon a cookie, pinning it in
     place if it's cached.  If successful, it returns true and leaves a the
     access count incremented.

 (3) void fscache_end_cookie_access(struct fscache_cookie *cookie,
				    enum fscache_access_trace why);

     This function drops the access count obtained by (2), permitting
     object withdrawal to take place when it reaches zero.

A tracepoint is provided to track changes to the access counter on a
cookie.

Changes
=======
ver #2:
 - Don't hold n_accesses elevated whilst cache is bound to a cookie, but
   rather add a flag that prevents the state machine from being queued when
   n_accesses reaches 0.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
Link: https://lore.kernel.org/r/163819595085.215744.1706073049250505427.stgit@warthog.procyon.org.uk/ # v1
Link: https://lore.kernel.org/r/163906895313.143852.10141619544149102193.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/163967095980.1823006.1133648159424418877.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/164021503063.640689.8870918985269528670.stgit@warthog.procyon.org.uk/ # v4
---
 include/linux/fscache-cache.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index fbbd8a2afe12..66624407ba84 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -61,6 +61,8 @@ extern struct fscache_cookie *fscache_get_cookie(struct fscache_cookie *cookie,
 						 enum fscache_cookie_trace where);
 extern void fscache_put_cookie(struct fscache_cookie *cookie,
 			       enum fscache_cookie_trace where);
+extern void fscache_end_cookie_access(struct fscache_cookie *cookie,
+				      enum fscache_access_trace why);
 extern void fscache_set_cookie_state(struct fscache_cookie *cookie,
 				     enum fscache_cookie_state state);
 
-- 
cgit v1.2.3


From 2e0c76aee25f33c482abda6224bd87732359354d Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 20 Oct 2021 15:00:26 +0100
Subject: fscache: Implement functions add/remove a cache

Implement functions to allow the cache backend to add or remove a cache:

 (1) Declare a cache to be live:

	int fscache_add_cache(struct fscache_cache *cache,
			      const struct fscache_cache_ops *ops,
			      void *cache_priv);

     Take a previously acquired cache cookie, set the operations table and
     private data and mark the cache open for access.

 (2) Withdraw a cache from service:

	void fscache_withdraw_cache(struct fscache_cache *cache);

     This marks the cache as withdrawn and thus prevents further
     cache-level and volume-level accesses.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
Link: https://lore.kernel.org/r/163819596022.215744.8799712491432238827.stgit@warthog.procyon.org.uk/ # v1
Link: https://lore.kernel.org/r/163906896599.143852.17049208999019262884.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/163967097870.1823006.3470041000971522030.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/164021505541.640689.1819714759326331054.stgit@warthog.procyon.org.uk/ # v4
---
 include/linux/fscache-cache.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index 66624407ba84..f78add6e7823 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -33,6 +33,7 @@ enum fscache_cache_state {
  * Cache cookie.
  */
 struct fscache_cache {
+	const struct fscache_cache_ops *ops;
 	struct list_head	cache_link;	/* Link in cache list */
 	void			*cache_priv;	/* Private cache data (or NULL) */
 	refcount_t		ref;
@@ -44,6 +45,14 @@ struct fscache_cache {
 	char			*name;
 };
 
+/*
+ * cache operations
+ */
+struct fscache_cache_ops {
+	/* name of cache provider */
+	const char *name;
+};
+
 extern struct workqueue_struct *fscache_wq;
 
 /*
@@ -52,6 +61,10 @@ extern struct workqueue_struct *fscache_wq;
 extern struct rw_semaphore fscache_addremove_sem;
 extern struct fscache_cache *fscache_acquire_cache(const char *name);
 extern void fscache_relinquish_cache(struct fscache_cache *cache);
+extern int fscache_add_cache(struct fscache_cache *cache,
+			     const struct fscache_cache_ops *ops,
+			     void *cache_priv);
+extern void fscache_withdraw_cache(struct fscache_cache *cache);
 
 extern void fscache_end_volume_access(struct fscache_volume *volume,
 				      struct fscache_cookie *cookie,
-- 
cgit v1.2.3


From bfa22da3ed652aa15acd4246fa13a0de6dbe4a59 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 20 Oct 2021 15:26:17 +0100
Subject: fscache: Provide and use cache methods to lookup/create/free a volume

Add cache methods to lookup, create and remove a volume.

Looking up or creating the volume requires the cache pinning for access;
freeing the volume requires the volume pinning for access.  The
->acquire_volume() method is used to ask the cache backend to lookup and,
if necessary, create a volume; the ->free_volume() method is used to free
the resources for a volume.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
Link: https://lore.kernel.org/r/163819597821.215744.5225318658134989949.stgit@warthog.procyon.org.uk/ # v1
Link: https://lore.kernel.org/r/163906898645.143852.8537799955945956818.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/163967099771.1823006.1455197910571061835.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/164021507345.640689.4073511598838843040.stgit@warthog.procyon.org.uk/ # v4
---
 include/linux/fscache-cache.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index f78add6e7823..a10b66ca3544 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -51,6 +51,12 @@ struct fscache_cache {
 struct fscache_cache_ops {
 	/* name of cache provider */
 	const char *name;
+
+	/* Acquire a volume */
+	void (*acquire_volume)(struct fscache_volume *volume);
+
+	/* Free the cache's data attached to a volume */
+	void (*free_volume)(struct fscache_volume *volume);
 };
 
 extern struct workqueue_struct *fscache_wq;
@@ -65,6 +71,7 @@ extern int fscache_add_cache(struct fscache_cache *cache,
 			     const struct fscache_cache_ops *ops,
 			     void *cache_priv);
 extern void fscache_withdraw_cache(struct fscache_cache *cache);
+extern void fscache_withdraw_volume(struct fscache_volume *volume);
 
 extern void fscache_end_volume_access(struct fscache_volume *volume,
 				      struct fscache_cookie *cookie,
-- 
cgit v1.2.3


From 29f18e79fe7c5f8011befeda9be6b220a350f947 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 20 Oct 2021 15:00:26 +0100
Subject: fscache: Add a function for a cache backend to note an I/O error

Add a function to the backend API to note an I/O error in a cache.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
Link: https://lore.kernel.org/r/163819598741.215744.891281275151382095.stgit@warthog.procyon.org.uk/ # v1
Link: https://lore.kernel.org/r/163906901316.143852.15225412215771586528.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/163967100721.1823006.16435671567428949398.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/164021508840.640689.11902836226570620424.stgit@warthog.procyon.org.uk/ # v4
---
 include/linux/fscache-cache.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index a10b66ca3544..936ef731bbc7 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -73,6 +73,8 @@ extern int fscache_add_cache(struct fscache_cache *cache,
 extern void fscache_withdraw_cache(struct fscache_cache *cache);
 extern void fscache_withdraw_volume(struct fscache_volume *volume);
 
+extern void fscache_io_error(struct fscache_cache *cache);
+
 extern void fscache_end_volume_access(struct fscache_volume *volume,
 				      struct fscache_cookie *cookie,
 				      enum fscache_access_trace why);
-- 
cgit v1.2.3


From 5d00e426f95e7ea036fec2a0aceb3f71d6dbdf92 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 20 Oct 2021 15:53:34 +0100
Subject: fscache: Implement simple cookie state machine

Implement a very simple cookie state machine to handle lookup,
invalidation, withdrawal, relinquishment and, to be added later, commit on
LRU discard.

Three cache methods are provided: ->lookup_cookie() to look up and, if
necessary, create a data storage object; ->withdraw_cookie() to free the
resources associated with that object and potentially delete it; and
->prepare_to_write(), to do prepare for changes to the cached data to be
modified locally.

Changes
=======
ver #3:
 - Fix a race between LRU discard and relinquishment whereby the former
   would override the latter and thus the latter would never happen[1].

ver #2:
 - Don't hold n_accesses elevated whilst cache is bound to a cookie, but
   rather add a flag that prevents the state machine from being queued when
   n_accesses reaches 0.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
Link: https://lore.kernel.org/r/599331.1639410068@warthog.procyon.org.uk/ [1]
Link: https://lore.kernel.org/r/163819599657.215744.15799615296912341745.stgit@warthog.procyon.org.uk/ # v1
Link: https://lore.kernel.org/r/163906903925.143852.1805855338154353867.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/163967105456.1823006.14730395299835841776.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/164021510706.640689.7961423370243272583.stgit@warthog.procyon.org.uk/ # v4
---
 include/linux/fscache-cache.h | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index 936ef731bbc7..ae6a75976450 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -57,6 +57,15 @@ struct fscache_cache_ops {
 
 	/* Free the cache's data attached to a volume */
 	void (*free_volume)(struct fscache_volume *volume);
+
+	/* Look up a cookie in the cache */
+	bool (*lookup_cookie)(struct fscache_cookie *cookie);
+
+	/* Withdraw an object without any cookie access counts held */
+	void (*withdraw_cookie)(struct fscache_cookie *cookie);
+
+	/* Prepare to write to a live cache object */
+	void (*prepare_to_write)(struct fscache_cookie *cookie);
 };
 
 extern struct workqueue_struct *fscache_wq;
@@ -72,6 +81,7 @@ extern int fscache_add_cache(struct fscache_cache *cache,
 			     void *cache_priv);
 extern void fscache_withdraw_cache(struct fscache_cache *cache);
 extern void fscache_withdraw_volume(struct fscache_volume *volume);
+extern void fscache_withdraw_cookie(struct fscache_cookie *cookie);
 
 extern void fscache_io_error(struct fscache_cache *cache);
 
@@ -85,8 +95,21 @@ extern void fscache_put_cookie(struct fscache_cookie *cookie,
 			       enum fscache_cookie_trace where);
 extern void fscache_end_cookie_access(struct fscache_cookie *cookie,
 				      enum fscache_access_trace why);
-extern void fscache_set_cookie_state(struct fscache_cookie *cookie,
-				     enum fscache_cookie_state state);
+extern void fscache_cookie_lookup_negative(struct fscache_cookie *cookie);
+extern void fscache_caching_failed(struct fscache_cookie *cookie);
+
+/**
+ * fscache_cookie_state - Read the state of a cookie
+ * @cookie: The cookie to query
+ *
+ * Get the state of a cookie, imposing an ordering between the cookie contents
+ * and the state value.  Paired with fscache_set_cookie_state().
+ */
+static inline
+enum fscache_cookie_state fscache_cookie_state(struct fscache_cookie *cookie)
+{
+	return smp_load_acquire(&cookie->state);
+}
 
 /**
  * fscache_get_key - Get a pointer to the cookie key
-- 
cgit v1.2.3


From 12bb21a29c19aae50cfad4e2bb5c943108f34a7d Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 20 Oct 2021 15:53:34 +0100
Subject: fscache: Implement cookie user counting and resource pinning

Provide a pair of functions to count the number of users of a cookie (open
files, writeback, invalidation, resizing, reads, writes), to obtain and pin
resources for the cookie and to prevent culling for the whilst there are
users.

The first function marks a cookie as being in use:

	void fscache_use_cookie(struct fscache_cookie *cookie,
				bool will_modify);

The caller should indicate the cookie to use and whether or not the caller
is in a context that may modify the cookie (e.g. a file open O_RDWR).

If the cookie is not already resourced, fscache will ask the cache backend
in the background to do whatever it needs to look up, create or otherwise
obtain the resources necessary to access data.  This is pinned to the
cookie and may not be culled, though it may be withdrawn if the cache as a
whole is withdrawn.

The second function removes the in-use mark from a cookie and, optionally,
updates the coherency data:

	void fscache_unuse_cookie(struct fscache_cookie *cookie,
				  const void *aux_data,
				  const loff_t *object_size);

If non-NULL, the aux_data buffer and/or the object_size will be saved into
the cookie and will be set on the backing store when the object is
committed.

If this removes the last usage on a cookie, the cookie is placed onto an
LRU list from which it will be removed and closed after a couple of seconds
if it doesn't get reused.  This prevents resource overload in the cache -
in particular it prevents it from holding too many files open.

Changes
=======
ver #2:
 - Fix fscache_unuse_cookie() to use atomic_dec_and_lock() to avoid a
   potential race if the cookie gets reused before it completes the
   unusement.
 - Added missing transition to LRU_DISCARDING state.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
Link: https://lore.kernel.org/r/163819600612.215744.13678350304176542741.stgit@warthog.procyon.org.uk/ # v1
Link: https://lore.kernel.org/r/163906907567.143852.16979631199380722019.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/163967106467.1823006.6790864931048582667.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/164021511674.640689.10084988363699111860.stgit@warthog.procyon.org.uk/ # v4
---
 include/linux/fscache.h | 82 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 81 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index 4450d17c11e8..e6c321e5bf73 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -22,12 +22,14 @@
 #define fscache_available() (1)
 #define fscache_volume_valid(volume) (volume)
 #define fscache_cookie_valid(cookie) (cookie)
-#define fscache_cookie_enabled(cookie) (cookie)
+#define fscache_resources_valid(cres) ((cres)->cache_priv)
+#define fscache_cookie_enabled(cookie) (cookie && !test_bit(FSCACHE_COOKIE_DISABLED, &cookie->flags))
 #else
 #define __fscache_available (0)
 #define fscache_available() (0)
 #define fscache_volume_valid(volume) (0)
 #define fscache_cookie_valid(cookie) (0)
+#define fscache_resources_valid(cres) (false)
 #define fscache_cookie_enabled(cookie) (0)
 #endif
 
@@ -46,6 +48,7 @@ enum fscache_cookie_state {
 	FSCACHE_COOKIE_STATE_CREATING,		/* The cache object is being created */
 	FSCACHE_COOKIE_STATE_ACTIVE,		/* The cache is active, readable and writable */
 	FSCACHE_COOKIE_STATE_FAILED,		/* The cache failed, withdraw to clear */
+	FSCACHE_COOKIE_STATE_LRU_DISCARDING,	/* The cookie is being discarded by the LRU */
 	FSCACHE_COOKIE_STATE_WITHDRAWING,	/* The cookie is being withdrawn */
 	FSCACHE_COOKIE_STATE_RELINQUISHING,	/* The cookie is being relinquished */
 	FSCACHE_COOKIE_STATE_DROPPED,		/* The cookie has been dropped */
@@ -147,6 +150,8 @@ extern struct fscache_cookie *__fscache_acquire_cookie(
 	const void *, size_t,
 	const void *, size_t,
 	loff_t);
+extern void __fscache_use_cookie(struct fscache_cookie *, bool);
+extern void __fscache_unuse_cookie(struct fscache_cookie *, const void *, const loff_t *);
 extern void __fscache_relinquish_cookie(struct fscache_cookie *, bool);
 
 /**
@@ -228,6 +233,39 @@ struct fscache_cookie *fscache_acquire_cookie(struct fscache_volume *volume,
 					object_size);
 }
 
+/**
+ * fscache_use_cookie - Request usage of cookie attached to an object
+ * @object: Object description
+ * @will_modify: If cache is expected to be modified locally
+ *
+ * Request usage of the cookie attached to an object.  The caller should tell
+ * the cache if the object's contents are about to be modified locally and then
+ * the cache can apply the policy that has been set to handle this case.
+ */
+static inline void fscache_use_cookie(struct fscache_cookie *cookie,
+				      bool will_modify)
+{
+	if (fscache_cookie_valid(cookie))
+		__fscache_use_cookie(cookie, will_modify);
+}
+
+/**
+ * fscache_unuse_cookie - Cease usage of cookie attached to an object
+ * @object: Object description
+ * @aux_data: Updated auxiliary data (or NULL)
+ * @object_size: Revised size of the object (or NULL)
+ *
+ * Cease usage of the cookie attached to an object.  When the users count
+ * reaches zero then the cookie relinquishment will be permitted to proceed.
+ */
+static inline void fscache_unuse_cookie(struct fscache_cookie *cookie,
+					const void *aux_data,
+					const loff_t *object_size)
+{
+	if (fscache_cookie_valid(cookie))
+		__fscache_unuse_cookie(cookie, aux_data, object_size);
+}
+
 /**
  * fscache_relinquish_cookie - Return the cookie to the cache, maybe discarding
  * it
@@ -247,4 +285,46 @@ void fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire)
 		__fscache_relinquish_cookie(cookie, retire);
 }
 
+/*
+ * Find the auxiliary data on a cookie.
+ */
+static inline void *fscache_get_aux(struct fscache_cookie *cookie)
+{
+	if (cookie->aux_len <= sizeof(cookie->inline_aux))
+		return cookie->inline_aux;
+	else
+		return cookie->aux;
+}
+
+/*
+ * Update the auxiliary data on a cookie.
+ */
+static inline
+void fscache_update_aux(struct fscache_cookie *cookie,
+			const void *aux_data, const loff_t *object_size)
+{
+	void *p = fscache_get_aux(cookie);
+
+	if (aux_data && p)
+		memcpy(p, aux_data, cookie->aux_len);
+	if (object_size)
+		cookie->object_size = *object_size;
+}
+
+#ifdef CONFIG_FSCACHE_STATS
+extern atomic_t fscache_n_updates;
+#endif
+
+static inline
+void __fscache_update_cookie(struct fscache_cookie *cookie, const void *aux_data,
+			     const loff_t *object_size)
+{
+#ifdef CONFIG_FSCACHE_STATS
+	atomic_inc(&fscache_n_updates);
+#endif
+	fscache_update_aux(cookie, aux_data, object_size);
+	smp_wmb();
+	set_bit(FSCACHE_COOKIE_NEEDS_UPDATE, &cookie->flags);
+}
+
 #endif /* _LINUX_FSCACHE_H */
-- 
cgit v1.2.3


From d24af13e2e2358a602740c7817ea90da43d3e740 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 20 Oct 2021 15:53:34 +0100
Subject: fscache: Implement cookie invalidation

Add a function to invalidate the cache behind a cookie:

	void fscache_invalidate(struct fscache_cookie *cookie,
				const void *aux_data,
				loff_t size,
				unsigned int flags)

This causes any cached data for the specified cookie to be discarded.  If
the cookie is marked as being in use, a new cache object will be created if
possible and future I/O will use that instead.  In-flight I/O should be
abandoned (writes) or reconsidered (reads).  Each time it is called
cookie->inval_counter is incremented and this can be used to detect
invalidation at the end of an I/O operation.

The coherency data attached to the cookie can be updated and the cookie
size should be reset.  One flag is available, FSCACHE_INVAL_DIO_WRITE,
which should be used to indicate invalidation due to a DIO write on a
file.  This will temporarily disable caching for this cookie.

Changes
=======
ver #2:
 - Should only change to inval state if can get access to cache.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
Link: https://lore.kernel.org/r/163819602231.215744.11206598147269491575.stgit@warthog.procyon.org.uk/ # v1
Link: https://lore.kernel.org/r/163906909707.143852.18056070560477964891.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/163967107447.1823006.5945029409592119962.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/164021512640.640689.11418616313147754172.stgit@warthog.procyon.org.uk/ # v4
---
 include/linux/fscache-cache.h |  4 ++++
 include/linux/fscache.h       | 31 +++++++++++++++++++++++++++++++
 include/linux/netfs.h         |  1 +
 3 files changed, 36 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index ae6a75976450..1ad56bfd9d72 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -64,6 +64,9 @@ struct fscache_cache_ops {
 	/* Withdraw an object without any cookie access counts held */
 	void (*withdraw_cookie)(struct fscache_cookie *cookie);
 
+	/* Invalidate an object */
+	bool (*invalidate_cookie)(struct fscache_cookie *cookie);
+
 	/* Prepare to write to a live cache object */
 	void (*prepare_to_write)(struct fscache_cookie *cookie);
 };
@@ -96,6 +99,7 @@ extern void fscache_put_cookie(struct fscache_cookie *cookie,
 extern void fscache_end_cookie_access(struct fscache_cookie *cookie,
 				      enum fscache_access_trace why);
 extern void fscache_cookie_lookup_negative(struct fscache_cookie *cookie);
+extern void fscache_resume_after_invalidation(struct fscache_cookie *cookie);
 extern void fscache_caching_failed(struct fscache_cookie *cookie);
 
 /**
diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index e6c321e5bf73..0f36d1fac237 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -39,6 +39,8 @@ struct fscache_cookie;
 #define FSCACHE_ADV_WRITE_CACHE		0x00 /* Do cache if written to locally */
 #define FSCACHE_ADV_WRITE_NOCACHE	0x02 /* Don't cache if written to locally */
 
+#define FSCACHE_INVAL_DIO_WRITE		0x01 /* Invalidate due to DIO write */
+
 /*
  * Data object state.
  */
@@ -47,6 +49,7 @@ enum fscache_cookie_state {
 	FSCACHE_COOKIE_STATE_LOOKING_UP,	/* The cache object is being looked up */
 	FSCACHE_COOKIE_STATE_CREATING,		/* The cache object is being created */
 	FSCACHE_COOKIE_STATE_ACTIVE,		/* The cache is active, readable and writable */
+	FSCACHE_COOKIE_STATE_INVALIDATING,	/* The cache is being invalidated */
 	FSCACHE_COOKIE_STATE_FAILED,		/* The cache failed, withdraw to clear */
 	FSCACHE_COOKIE_STATE_LRU_DISCARDING,	/* The cookie is being discarded by the LRU */
 	FSCACHE_COOKIE_STATE_WITHDRAWING,	/* The cookie is being withdrawn */
@@ -153,6 +156,7 @@ extern struct fscache_cookie *__fscache_acquire_cookie(
 extern void __fscache_use_cookie(struct fscache_cookie *, bool);
 extern void __fscache_unuse_cookie(struct fscache_cookie *, const void *, const loff_t *);
 extern void __fscache_relinquish_cookie(struct fscache_cookie *, bool);
+extern void __fscache_invalidate(struct fscache_cookie *, const void *, loff_t, unsigned int);
 
 /**
  * fscache_acquire_volume - Register a volume as desiring caching services
@@ -327,4 +331,31 @@ void __fscache_update_cookie(struct fscache_cookie *cookie, const void *aux_data
 	set_bit(FSCACHE_COOKIE_NEEDS_UPDATE, &cookie->flags);
 }
 
+/**
+ * fscache_invalidate - Notify cache that an object needs invalidation
+ * @cookie: The cookie representing the cache object
+ * @aux_data: The updated auxiliary data for the cookie (may be NULL)
+ * @size: The revised size of the object.
+ * @flags: Invalidation flags (FSCACHE_INVAL_*)
+ *
+ * Notify the cache that an object is needs to be invalidated and that it
+ * should abort any retrievals or stores it is doing on the cache.  This
+ * increments inval_counter on the cookie which can be used by the caller to
+ * reconsider I/O requests as they complete.
+ *
+ * If @flags has FSCACHE_INVAL_DIO_WRITE set, this indicates that this is due
+ * to a direct I/O write and will cause caching to be disabled on this cookie
+ * until it is completely unused.
+ *
+ * See Documentation/filesystems/caching/netfs-api.rst for a complete
+ * description.
+ */
+static inline
+void fscache_invalidate(struct fscache_cookie *cookie,
+			const void *aux_data, loff_t size, unsigned int flags)
+{
+	if (fscache_cookie_enabled(cookie))
+		__fscache_invalidate(cookie, aux_data, size, flags);
+}
+
 #endif /* _LINUX_FSCACHE_H */
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 1ea22fc48818..5a46fde65759 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -124,6 +124,7 @@ struct netfs_cache_resources {
 	void				*cache_priv;
 	void				*cache_priv2;
 	unsigned int			debug_id;	/* Cookie debug ID */
+	unsigned int			inval_counter;	/* object->inval_counter at begin_op */
 };
 
 /*
-- 
cgit v1.2.3


From d64f4554dd177c5891c02424a8d9e80590b55b35 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 20 Oct 2021 14:06:34 +0100
Subject: fscache: Provide a means to begin an operation

Provide a function to begin a read operation:

	int fscache_begin_read_operation(
		struct netfs_cache_resources *cres,
		struct fscache_cookie *cookie)

This is primarily intended to be called by network filesystems on behalf of
netfslib, but may also be called to use the I/O access functions directly.
It attaches the resources required by the cache to cres struct from the
supplied cookie.

This holds access to the cache behind the cookie for the duration of the
operation and forces cache withdrawal and cookie invalidation to perform
synchronisation on the operation.  cres->inval_counter is set from the
cookie at this point so that it can be compared at the end of the
operation.

Note that this does not guarantee that the cache state is fully set up and
able to perform I/O immediately; looking up and creation may be left in
progress in the background.  The operations intended to be called by the
network filesystem, such as reading and writing, are expected to wait for
the cookie to move to the correct state.

This will, however, potentially sleep, waiting for a certain minimum state
to be set or for operations such as invalidate to advance far enough that
I/O can resume.


Also provide a function for the cache to call to wait for the cache object
to get to a state where it can be used for certain things:

	bool fscache_wait_for_operation(struct netfs_cache_resources *cres,
					enum fscache_want_stage stage);

This looks at the cache resources provided by the begin function and waits
for them to get to an appropriate stage.  There's a choice of wanting just
some parameters (FSCACHE_WANT_PARAM) or the ability to do I/O
(FSCACHE_WANT_READ or FSCACHE_WANT_WRITE).

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
Link: https://lore.kernel.org/r/163819603692.215744.146724961588817028.stgit@warthog.procyon.org.uk/ # v1
Link: https://lore.kernel.org/r/163906910672.143852.13856103384424986357.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/163967110245.1823006.2239170567540431836.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/164021513617.640689.16627329360866150606.stgit@warthog.procyon.org.uk/ # v4
---
 include/linux/fscache-cache.h | 11 ++++++++++
 include/linux/fscache.h       | 49 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index 1ad56bfd9d72..566497cf5f13 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -67,6 +67,10 @@ struct fscache_cache_ops {
 	/* Invalidate an object */
 	bool (*invalidate_cookie)(struct fscache_cookie *cookie);
 
+	/* Begin an operation for the netfs lib */
+	bool (*begin_operation)(struct netfs_cache_resources *cres,
+				enum fscache_want_state want_state);
+
 	/* Prepare to write to a live cache object */
 	void (*prepare_to_write)(struct fscache_cookie *cookie);
 };
@@ -101,6 +105,8 @@ extern void fscache_end_cookie_access(struct fscache_cookie *cookie,
 extern void fscache_cookie_lookup_negative(struct fscache_cookie *cookie);
 extern void fscache_resume_after_invalidation(struct fscache_cookie *cookie);
 extern void fscache_caching_failed(struct fscache_cookie *cookie);
+extern bool fscache_wait_for_operation(struct netfs_cache_resources *cred,
+				       enum fscache_want_state state);
 
 /**
  * fscache_cookie_state - Read the state of a cookie
@@ -129,4 +135,9 @@ static inline void *fscache_get_key(struct fscache_cookie *cookie)
 		return cookie->key;
 }
 
+static inline struct fscache_cookie *fscache_cres_cookie(struct netfs_cache_resources *cres)
+{
+	return cres->cache_priv;
+}
+
 #endif /* _LINUX_FSCACHE_CACHE_H */
diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index 0f36d1fac237..7cdc63c4fe35 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -41,6 +41,12 @@ struct fscache_cookie;
 
 #define FSCACHE_INVAL_DIO_WRITE		0x01 /* Invalidate due to DIO write */
 
+enum fscache_want_state {
+	FSCACHE_WANT_PARAMS,
+	FSCACHE_WANT_WRITE,
+	FSCACHE_WANT_READ,
+};
+
 /*
  * Data object state.
  */
@@ -157,6 +163,7 @@ extern void __fscache_use_cookie(struct fscache_cookie *, bool);
 extern void __fscache_unuse_cookie(struct fscache_cookie *, const void *, const loff_t *);
 extern void __fscache_relinquish_cookie(struct fscache_cookie *, bool);
 extern void __fscache_invalidate(struct fscache_cookie *, const void *, loff_t, unsigned int);
+extern int __fscache_begin_read_operation(struct netfs_cache_resources *, struct fscache_cookie *);
 
 /**
  * fscache_acquire_volume - Register a volume as desiring caching services
@@ -358,4 +365,46 @@ void fscache_invalidate(struct fscache_cookie *cookie,
 		__fscache_invalidate(cookie, aux_data, size, flags);
 }
 
+/**
+ * fscache_operation_valid - Return true if operations resources are usable
+ * @cres: The resources to check.
+ *
+ * Returns a pointer to the operations table if usable or NULL if not.
+ */
+static inline
+const struct netfs_cache_ops *fscache_operation_valid(const struct netfs_cache_resources *cres)
+{
+	return fscache_resources_valid(cres) ? cres->ops : NULL;
+}
+
+/**
+ * fscache_begin_read_operation - Begin a read operation for the netfs lib
+ * @cres: The cache resources for the read being performed
+ * @cookie: The cookie representing the cache object
+ *
+ * Begin a read operation on behalf of the netfs helper library.  @cres
+ * indicates the cache resources to which the operation state should be
+ * attached; @cookie indicates the cache object that will be accessed.
+ *
+ * This is intended to be called from the ->begin_cache_operation() netfs lib
+ * operation as implemented by the network filesystem.
+ *
+ * @cres->inval_counter is set from @cookie->inval_counter for comparison at
+ * the end of the operation.  This allows invalidation during the operation to
+ * be detected by the caller.
+ *
+ * Returns:
+ * * 0		- Success
+ * * -ENOBUFS	- No caching available
+ * * Other error code from the cache, such as -ENOMEM.
+ */
+static inline
+int fscache_begin_read_operation(struct netfs_cache_resources *cres,
+				 struct fscache_cookie *cookie)
+{
+	if (fscache_cookie_enabled(cookie))
+		return __fscache_begin_read_operation(cres, cookie);
+	return -ENOBUFS;
+}
+
 #endif /* _LINUX_FSCACHE_H */
-- 
cgit v1.2.3


From cdf262f29488e6c3432911ec487ea41918fcbcd7 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 11 Nov 2021 23:14:29 +0000
Subject: fscache: Count data storage objects in a cache

Count the data storage objects that are currently allocated in a cache.
This is used to pin certain cache structures until cache withdrawal is
complete.

Three helpers are provided to manage and make use of the count:

 (1) void fscache_count_object(struct fscache_cache *cache);

     This should be called by the cache backend to note that an object has
     been allocated and attached to the cache.

 (2) void fscache_uncount_object(struct fscache_cache *cache);

     This should be called by the backend to note that an object has been
     destroyed.  This sends a wakeup event that allows cache withdrawal to
     proceed if it was waiting for that object.

 (3) void fscache_wait_for_objects(struct fscache_cache *cache);

     This can be used by the backend to wait for all outstanding cache
     object to be destroyed.

Each cache's counter is displayed as part of /proc/fs/fscache/caches.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
Link: https://lore.kernel.org/r/163819608594.215744.1812706538117388252.stgit@warthog.procyon.org.uk/ # v1
Link: https://lore.kernel.org/r/163906911646.143852.168184059935530127.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/163967111846.1823006.9868154941573671255.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/164021516219.640689.4934796654308958158.stgit@warthog.procyon.org.uk/ # v4
---
 include/linux/fscache-cache.h | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index 566497cf5f13..337335d7a5e2 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -76,6 +76,7 @@ struct fscache_cache_ops {
 };
 
 extern struct workqueue_struct *fscache_wq;
+extern wait_queue_head_t fscache_clearance_waiters;
 
 /*
  * out-of-line cache backend functions
@@ -140,4 +141,42 @@ static inline struct fscache_cookie *fscache_cres_cookie(struct netfs_cache_reso
 	return cres->cache_priv;
 }
 
+/**
+ * fscache_count_object - Tell fscache that an object has been added
+ * @cache: The cache to account to
+ *
+ * Tell fscache that an object has been added to the cache.  This prevents the
+ * cache from tearing down the cache structure until the object is uncounted.
+ */
+static inline void fscache_count_object(struct fscache_cache *cache)
+{
+	atomic_inc(&cache->object_count);
+}
+
+/**
+ * fscache_uncount_object - Tell fscache that an object has been removed
+ * @cache: The cache to account to
+ *
+ * Tell fscache that an object has been removed from the cache and will no
+ * longer be accessed.  After this point, the cache cookie may be destroyed.
+ */
+static inline void fscache_uncount_object(struct fscache_cache *cache)
+{
+	if (atomic_dec_and_test(&cache->object_count))
+		wake_up_all(&fscache_clearance_waiters);
+}
+
+/**
+ * fscache_wait_for_objects - Wait for all objects to be withdrawn
+ * @cache: The cache to query
+ *
+ * Wait for all extant objects in a cache to finish being withdrawn
+ * and go away.
+ */
+static inline void fscache_wait_for_objects(struct fscache_cache *cache)
+{
+	wait_event(fscache_clearance_waiters,
+		   atomic_read(&cache->object_count) == 0);
+}
+
 #endif /* _LINUX_FSCACHE_CACHE_H */
-- 
cgit v1.2.3


From 8e7a867bb7309fbf47e8c2a68798b919fc02523f Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 20 Oct 2021 23:06:16 +0100
Subject: fscache: Provide read/write stat counters for the cache

Provide read/write stat counters for the cache backend to use.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
Link: https://lore.kernel.org/r/163819609532.215744.10821082637727410554.stgit@warthog.procyon.org.uk/ # v1
Link: https://lore.kernel.org/r/163906912598.143852.12960327989649429069.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/163967113830.1823006.3222957649202368162.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/164021517502.640689.6077928311710357342.stgit@warthog.procyon.org.uk/ # v4
---
 include/linux/fscache-cache.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index 337335d7a5e2..796c8b5c5305 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -179,4 +179,14 @@ static inline void fscache_wait_for_objects(struct fscache_cache *cache)
 		   atomic_read(&cache->object_count) == 0);
 }
 
+#ifdef CONFIG_FSCACHE_STATS
+extern atomic_t fscache_n_read;
+extern atomic_t fscache_n_write;
+#define fscache_count_read() atomic_inc(&fscache_n_read)
+#define fscache_count_write() atomic_inc(&fscache_n_write)
+#else
+#define fscache_count_read() do {} while(0)
+#define fscache_count_write() do {} while(0)
+#endif
+
 #endif /* _LINUX_FSCACHE_CACHE_H */
-- 
cgit v1.2.3


From ed1235eb78a7421cd0ac2ad09e931f8f07ccdc7c Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 20 Oct 2021 23:10:46 +0100
Subject: fscache: Provide a function to let the netfs update its coherency
 data

Provide a function to let the netfs update its coherency data:

	void fscache_update_cookie(struct fscache_cookie *cookie,
				   const void *aux_data,
				   const loff_t *object_size);

This will update the auxiliary data and/or the size of the object attached
to a cookie if either pointer is not-NULL and flag that the disk needs to
be updated.

Note that fscache_unuse_cookie() also allows this to be done.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
Link: https://lore.kernel.org/r/163819610438.215744.4223265964131424954.stgit@warthog.procyon.org.uk/ # v1
Link: https://lore.kernel.org/r/163906913530.143852.18150303220217653820.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/163967117795.1823006.7493373142653442595.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/164021518440.640689.6369952464473039268.stgit@warthog.procyon.org.uk/ # v4
---
 include/linux/fscache.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index 7cdc63c4fe35..fc77648c8af6 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -338,6 +338,28 @@ void __fscache_update_cookie(struct fscache_cookie *cookie, const void *aux_data
 	set_bit(FSCACHE_COOKIE_NEEDS_UPDATE, &cookie->flags);
 }
 
+/**
+ * fscache_update_cookie - Request that a cache object be updated
+ * @cookie: The cookie representing the cache object
+ * @aux_data: The updated auxiliary data for the cookie (may be NULL)
+ * @object_size: The current size of the object (may be NULL)
+ *
+ * Request an update of the index data for the cache object associated with the
+ * cookie.  The auxiliary data on the cookie will be updated first if @aux_data
+ * is set and the object size will be updated and the object possibly trimmed
+ * if @object_size is set.
+ *
+ * See Documentation/filesystems/caching/netfs-api.rst for a complete
+ * description.
+ */
+static inline
+void fscache_update_cookie(struct fscache_cookie *cookie, const void *aux_data,
+			   const loff_t *object_size)
+{
+	if (fscache_cookie_enabled(cookie))
+		__fscache_update_cookie(cookie, aux_data, object_size);
+}
+
 /**
  * fscache_invalidate - Notify cache that an object needs invalidation
  * @cookie: The cookie representing the cache object
-- 
cgit v1.2.3


From 3a11b3a86366ccbf0818b088ffecadf8b2d61177 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 14 Sep 2021 09:47:45 +0100
Subject: netfs: Pass more information on how to deal with a hole in the cache

Pass more information to the cache on how to deal with a hole if it
encounters one when trying to read from the cache.  Three options are
provided:

 (1) NETFS_READ_HOLE_IGNORE.  Read the hole along with the data, assuming
     it to be a punched-out extent by the backing filesystem.

 (2) NETFS_READ_HOLE_CLEAR.  If there's a hole, erase the requested region
     of the cache and clear the read buffer.

 (3) NETFS_READ_HOLE_FAIL.  Fail the read if a hole is detected.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
Link: https://lore.kernel.org/r/163819612321.215744.9738308885948264476.stgit@warthog.procyon.org.uk/ # v1
Link: https://lore.kernel.org/r/163906914460.143852.6284247083607910189.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/163967119923.1823006.15637375885194297582.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/164021519762.640689.16994364383313159319.stgit@warthog.procyon.org.uk/ # v4
---
 include/linux/netfs.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 5a46fde65759..b46c39d98bbd 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -196,6 +196,15 @@ struct netfs_read_request_ops {
 	void (*cleanup)(struct address_space *mapping, void *netfs_priv);
 };
 
+/*
+ * How to handle reading from a hole.
+ */
+enum netfs_read_from_hole {
+	NETFS_READ_HOLE_IGNORE,
+	NETFS_READ_HOLE_CLEAR,
+	NETFS_READ_HOLE_FAIL,
+};
+
 /*
  * Table of operations for access to a cache.  This is obtained by
  * rreq->ops->begin_cache_operation().
@@ -208,7 +217,7 @@ struct netfs_cache_ops {
 	int (*read)(struct netfs_cache_resources *cres,
 		    loff_t start_pos,
 		    struct iov_iter *iter,
-		    bool seek_data,
+		    enum netfs_read_from_hole read_hole,
 		    netfs_io_terminated_t term_func,
 		    void *term_func_priv);
 
-- 
cgit v1.2.3


From 9af1c6c3089b294ffa240e0fbba356666698b6d0 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 20 Oct 2021 14:06:34 +0100
Subject: fscache: Implement raw I/O interface

Provide a pair of functions to perform raw I/O on the cache.  The first
function allows an arbitrary asynchronous direct-IO read to be made against
a cache object, though the read should be aligned and sized appropriately
for the backing device:

        int fscache_read(struct netfs_cache_resources *cres,
                         loff_t start_pos,
                         struct iov_iter *iter,
                         enum netfs_read_from_hole read_hole,
                         netfs_io_terminated_t term_func,
                         void *term_func_priv);

The cache resources must have been previously initialised by
fscache_begin_read_operation().  A read operation is sent to the backing
filesystem, starting at start_pos within the file.  The size of the read is
specified by the iterator, as is the location of the output buffer.

If there is a hole in the data it can be ignored and left to the backing
filesystem to deal with (NETFS_READ_HOLE_IGNORE), a hole at the beginning
can be skipped over and the buffer padded with zeros
(NETFS_READ_HOLE_CLEAR) or -ENODATA can be given (NETFS_READ_HOLE_FAIL).

If term_func is not NULL, the operation may be performed asynchronously.
Upon completion, successful or otherwise, (*term_func)() will be called and
passed term_func_priv, along with an error or the amount of data
transferred.  If the op is run asynchronously, fscache_read() will return
-EIOCBQUEUED.

The second function allows an arbitrary asynchronous direct-IO write to be
made against a cache object, though the write should be aligned and sized
appropriately for the backing device:

        int fscache_write(struct netfs_cache_resources *cres,
                          loff_t start_pos,
                          struct iov_iter *iter,
                          netfs_io_terminated_t term_func,
                          void *term_func_priv);

This works in very similar way to fscache_read(), except that there's no
need to deal with holes (they're just overwritten).

The caller is responsible for preventing concurrent overlapping writes.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
Link: https://lore.kernel.org/r/163819613224.215744.7877577215582621254.stgit@warthog.procyon.org.uk/ # v1
Link: https://lore.kernel.org/r/163906915386.143852.16936177636106480724.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/163967122632.1823006.7487049517698562172.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/164021521420.640689.12747258780542678309.stgit@warthog.procyon.org.uk/ # v4
---
 include/linux/fscache.h | 74 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index fc77648c8af6..ae753cae0fdd 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -429,4 +429,78 @@ int fscache_begin_read_operation(struct netfs_cache_resources *cres,
 	return -ENOBUFS;
 }
 
+/**
+ * fscache_read - Start a read from the cache.
+ * @cres: The cache resources to use
+ * @start_pos: The beginning file offset in the cache file
+ * @iter: The buffer to fill - and also the length
+ * @read_hole: How to handle a hole in the data.
+ * @term_func: The function to call upon completion
+ * @term_func_priv: The private data for @term_func
+ *
+ * Start a read from the cache.  @cres indicates the cache object to read from
+ * and must be obtained by a call to fscache_begin_operation() beforehand.
+ *
+ * The data is read into the iterator, @iter, and that also indicates the size
+ * of the operation.  @start_pos is the start position in the file, though if
+ * @seek_data is set appropriately, the cache can use SEEK_DATA to find the
+ * next piece of data, writing zeros for the hole into the iterator.
+ *
+ * Upon termination of the operation, @term_func will be called and supplied
+ * with @term_func_priv plus the amount of data written, if successful, or the
+ * error code otherwise.
+ *
+ * @read_hole indicates how a partially populated region in the cache should be
+ * handled.  It can be one of a number of settings:
+ *
+ *	NETFS_READ_HOLE_IGNORE - Just try to read (may return a short read).
+ *
+ *	NETFS_READ_HOLE_CLEAR - Seek for data, clearing the part of the buffer
+ *				skipped over, then do as for IGNORE.
+ *
+ *	NETFS_READ_HOLE_FAIL - Give ENODATA if we encounter a hole.
+ */
+static inline
+int fscache_read(struct netfs_cache_resources *cres,
+		 loff_t start_pos,
+		 struct iov_iter *iter,
+		 enum netfs_read_from_hole read_hole,
+		 netfs_io_terminated_t term_func,
+		 void *term_func_priv)
+{
+	const struct netfs_cache_ops *ops = fscache_operation_valid(cres);
+	return ops->read(cres, start_pos, iter, read_hole,
+			 term_func, term_func_priv);
+}
+
+/**
+ * fscache_write - Start a write to the cache.
+ * @cres: The cache resources to use
+ * @start_pos: The beginning file offset in the cache file
+ * @iter: The data to write - and also the length
+ * @term_func: The function to call upon completion
+ * @term_func_priv: The private data for @term_func
+ *
+ * Start a write to the cache.  @cres indicates the cache object to write to and
+ * must be obtained by a call to fscache_begin_operation() beforehand.
+ *
+ * The data to be written is obtained from the iterator, @iter, and that also
+ * indicates the size of the operation.  @start_pos is the start position in
+ * the file.
+ *
+ * Upon termination of the operation, @term_func will be called and supplied
+ * with @term_func_priv plus the amount of data written, if successful, or the
+ * error code otherwise.
+ */
+static inline
+int fscache_write(struct netfs_cache_resources *cres,
+		  loff_t start_pos,
+		  struct iov_iter *iter,
+		  netfs_io_terminated_t term_func,
+		  void *term_func_priv)
+{
+	const struct netfs_cache_ops *ops = fscache_operation_valid(cres);
+	return ops->write(cres, start_pos, iter, term_func, term_func_priv);
+}
+
 #endif /* _LINUX_FSCACHE_H */
-- 
cgit v1.2.3


From b6e16652d6c0e4f9e9b120f66966ec153f0623fc Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 20 Oct 2021 14:06:34 +0100
Subject: fscache: Implement higher-level write I/O interface

Provide a higher-level function than fscache_write() to perform a write
from an inode's pagecache to the cache, whilst fending off concurrent
writes by means of the PG_fscache mark on a page:

	void fscache_write_to_cache(struct fscache_cookie *cookie,
				    struct address_space *mapping,
				    loff_t start,
				    size_t len,
				    loff_t i_size,
				    netfs_io_terminated_t term_func,
				    void *term_func_priv,
				    bool caching);

If caching is false, this function does nothing except call (*term_func)()
if given.  It assumes that, in such a case, PG_fscache will not have been
set on the pages.

Otherwise, if caching is true, this function requires the source pages to
have had PG_fscache set on them before calling.  start and len define the
region of the file to be modified and i_size indicates the new file size.
The source pages are extracted from the mapping.

term_func and term_func_priv work as for fscache_write().  The PG_fscache
marks will be cleared at the end of the operation, before term_func is
called or the function otherwise returns.

There is an additonal helper function to clear the PG_fscache bits from a
range of pages:

	void fscache_clear_page_bits(struct fscache_cookie *cookie,
				     struct address_space *mapping,
				     loff_t start, size_t len,
				     bool caching);

If caching is true, the pages to be managed are expected to be located on
mapping in the range defined by start and len.  If caching is false, it
does nothing.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
Link: https://lore.kernel.org/r/163819614155.215744.5528123235123721230.stgit@warthog.procyon.org.uk/ # v1
Link: https://lore.kernel.org/r/163906916346.143852.15632773570362489926.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/163967123599.1823006.12946816026724657428.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/164021522672.640689.4381958316198807813.stgit@warthog.procyon.org.uk/ # v4
---
 include/linux/fscache.h | 63 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index ae753cae0fdd..9d469613e16c 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -165,6 +165,11 @@ extern void __fscache_relinquish_cookie(struct fscache_cookie *, bool);
 extern void __fscache_invalidate(struct fscache_cookie *, const void *, loff_t, unsigned int);
 extern int __fscache_begin_read_operation(struct netfs_cache_resources *, struct fscache_cookie *);
 
+extern void __fscache_write_to_cache(struct fscache_cookie *, struct address_space *,
+				     loff_t, size_t, loff_t, netfs_io_terminated_t, void *,
+				     bool);
+extern void __fscache_clear_page_bits(struct address_space *, loff_t, size_t);
+
 /**
  * fscache_acquire_volume - Register a volume as desiring caching services
  * @volume_key: An identification string for the volume
@@ -503,4 +508,62 @@ int fscache_write(struct netfs_cache_resources *cres,
 	return ops->write(cres, start_pos, iter, term_func, term_func_priv);
 }
 
+/**
+ * fscache_clear_page_bits - Clear the PG_fscache bits from a set of pages
+ * @cookie: The cookie representing the cache object
+ * @mapping: The netfs inode to use as the source
+ * @start: The start position in @mapping
+ * @len: The amount of data to unlock
+ * @caching: If PG_fscache has been set
+ *
+ * Clear the PG_fscache flag from a sequence of pages and wake up anyone who's
+ * waiting.
+ */
+static inline void fscache_clear_page_bits(struct fscache_cookie *cookie,
+					   struct address_space *mapping,
+					   loff_t start, size_t len,
+					   bool caching)
+{
+	if (caching)
+		__fscache_clear_page_bits(mapping, start, len);
+}
+
+/**
+ * fscache_write_to_cache - Save a write to the cache and clear PG_fscache
+ * @cookie: The cookie representing the cache object
+ * @mapping: The netfs inode to use as the source
+ * @start: The start position in @mapping
+ * @len: The amount of data to write back
+ * @i_size: The new size of the inode
+ * @term_func: The function to call upon completion
+ * @term_func_priv: The private data for @term_func
+ * @caching: If PG_fscache has been set
+ *
+ * Helper function for a netfs to write dirty data from an inode into the cache
+ * object that's backing it.
+ *
+ * @start and @len describe the range of the data.  This does not need to be
+ * page-aligned, but to satisfy DIO requirements, the cache may expand it up to
+ * the page boundaries on either end.  All the pages covering the range must be
+ * marked with PG_fscache.
+ *
+ * If given, @term_func will be called upon completion and supplied with
+ * @term_func_priv.  Note that the PG_fscache flags will have been cleared by
+ * this point, so the netfs must retain its own pin on the mapping.
+ */
+static inline void fscache_write_to_cache(struct fscache_cookie *cookie,
+					  struct address_space *mapping,
+					  loff_t start, size_t len, loff_t i_size,
+					  netfs_io_terminated_t term_func,
+					  void *term_func_priv,
+					  bool caching)
+{
+	if (caching)
+		__fscache_write_to_cache(cookie, mapping, start, len, i_size,
+					 term_func, term_func_priv, caching);
+	else if (term_func)
+		term_func(term_func_priv, -ENOBUFS, false);
+
+}
+
 #endif /* _LINUX_FSCACHE_H */
-- 
cgit v1.2.3


From 08276bdae68b022a7726edf7416b6748e3df5395 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 20 Oct 2021 23:50:01 +0100
Subject: vfs, fscache: Implement pinning of cache usage for writeback

Cachefiles has a problem in that it needs to keep the backing file for a
cookie open whilst there are local modifications pending that need to be
written to it.  However, we don't want to keep the file open indefinitely,
as that causes EMFILE/ENFILE/ENOMEM problems.

Reopening the cache file, however, is a problem if this is being done due
to writeback triggered by exit().  Some filesystems will oops if we try to
open a file in that context because they want to access current->fs or
other resources that have already been dismantled.

To get around this, I added the following:

 (1) An inode flag, I_PINNING_FSCACHE_WB, to be set on a network filesystem
     inode to indicate that we have a usage count on the cookie caching
     that inode.

 (2) A flag in struct writeback_control, unpinned_fscache_wb, that is set
     when __writeback_single_inode() clears the last dirty page from
     i_pages - at which point it clears I_PINNING_FSCACHE_WB and sets this
     flag.

     This has to be done here so that clearing I_PINNING_FSCACHE_WB can be
     done atomically with the check of PAGECACHE_TAG_DIRTY that clears
     I_DIRTY_PAGES.

 (3) A function, fscache_set_page_dirty(), which if it is not set, sets
     I_PINNING_FSCACHE_WB and calls fscache_use_cookie() to pin the cache
     resources.

 (4) A function, fscache_unpin_writeback(), to be called by ->write_inode()
     to unuse the cookie.

 (5) A function, fscache_clear_inode_writeback(), to be called when the
     inode is evicted, before clear_inode() is called.  This cleans up any
     lingering I_PINNING_FSCACHE_WB.

The network filesystem can then use these tools to make sure that
fscache_write_to_cache() can write locally modified data to the cache as
well as to the server.

For the future, I'm working on write helpers for netfs lib that should
allow this facility to be removed by keeping track of the dirty regions
separately - but that's incomplete at the moment and is also going to be
affected by folios, one way or another, since it deals with pages

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
Link: https://lore.kernel.org/r/163819615157.215744.17623791756928043114.stgit@warthog.procyon.org.uk/ # v1
Link: https://lore.kernel.org/r/163906917856.143852.8224898306177154573.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/163967124567.1823006.14188359004568060298.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/164021524705.640689.17824932021727663017.stgit@warthog.procyon.org.uk/ # v4
---
 include/linux/fs.h        |  3 +++
 include/linux/fscache.h   | 41 +++++++++++++++++++++++++++++++++++++++++
 include/linux/writeback.h |  1 +
 3 files changed, 45 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index bbf812ce89a8..2c0b8e77d9ab 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2418,6 +2418,8 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
  *			Used to detect that mark_inode_dirty() should not move
  * 			inode between dirty lists.
  *
+ * I_PINNING_FSCACHE_WB	Inode is pinning an fscache object for writeback.
+ *
  * Q: What is the difference between I_WILL_FREE and I_FREEING?
  */
 #define I_DIRTY_SYNC		(1 << 0)
@@ -2440,6 +2442,7 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
 #define I_CREATING		(1 << 15)
 #define I_DONTCACHE		(1 << 16)
 #define I_SYNC_QUEUED		(1 << 17)
+#define I_PINNING_FSCACHE_WB	(1 << 18)
 
 #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
 #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES)
diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index 9d469613e16c..18e725671594 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -16,6 +16,7 @@
 
 #include <linux/fs.h>
 #include <linux/netfs.h>
+#include <linux/writeback.h>
 
 #if defined(CONFIG_FSCACHE) || defined(CONFIG_FSCACHE_MODULE)
 #define __fscache_available (1)
@@ -566,4 +567,44 @@ static inline void fscache_write_to_cache(struct fscache_cookie *cookie,
 
 }
 
+#if __fscache_available
+extern int fscache_set_page_dirty(struct page *page, struct fscache_cookie *cookie);
+#else
+#define fscache_set_page_dirty(PAGE, COOKIE) (__set_page_dirty_nobuffers((PAGE)))
+#endif
+
+/**
+ * fscache_unpin_writeback - Unpin writeback resources
+ * @wbc: The writeback control
+ * @cookie: The cookie referring to the cache object
+ *
+ * Unpin the writeback resources pinned by fscache_set_page_dirty().  This is
+ * intended to be called by the netfs's ->write_inode() method.
+ */
+static inline void fscache_unpin_writeback(struct writeback_control *wbc,
+					   struct fscache_cookie *cookie)
+{
+	if (wbc->unpinned_fscache_wb)
+		fscache_unuse_cookie(cookie, NULL, NULL);
+}
+
+/**
+ * fscache_clear_inode_writeback - Clear writeback resources pinned by an inode
+ * @cookie: The cookie referring to the cache object
+ * @inode: The inode to clean up
+ * @aux: Auxiliary data to apply to the inode
+ *
+ * Clear any writeback resources held by an inode when the inode is evicted.
+ * This must be called before clear_inode() is called.
+ */
+static inline void fscache_clear_inode_writeback(struct fscache_cookie *cookie,
+						 struct inode *inode,
+						 const void *aux)
+{
+	if (inode->i_state & I_PINNING_FSCACHE_WB) {
+		loff_t i_size = i_size_read(inode);
+		fscache_unuse_cookie(cookie, aux, &i_size);
+	}
+}
+
 #endif /* _LINUX_FSCACHE_H */
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 3bfd487d1dd2..fec248ab1fec 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -68,6 +68,7 @@ struct writeback_control {
 	unsigned for_reclaim:1;		/* Invoked from the page allocator */
 	unsigned range_cyclic:1;	/* range_start is cyclic */
 	unsigned for_sync:1;		/* sync(2) WB_SYNC_ALL writeback */
+	unsigned unpinned_fscache_wb:1;	/* Cleared I_PINNING_FSCACHE_WB */
 
 	/*
 	 * When writeback IOs are bounced through async layers, only the
-- 
cgit v1.2.3


From 1f67e6d0b18853c641d861a671f46a4964a88510 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 20 Oct 2021 14:06:34 +0100
Subject: fscache: Provide a function to note the release of a page

Provide a function to be called from a network filesystem's releasepage
method to indicate that a page has been released that might have been a
reflection of data upon the server - and now that data must be reloaded
from the server or the cache.

This is used to end an optimisation for empty files, in particular files
that have just been created locally, whereby we know there cannot yet be
any data that we would need to read from the server or the cache.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
Link: https://lore.kernel.org/r/163819617128.215744.4725572296135656508.stgit@warthog.procyon.org.uk/ # v1
Link: https://lore.kernel.org/r/163906920354.143852.7511819614661372008.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/163967128061.1823006.611781655060034988.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/164021525963.640689.9264556596205140044.stgit@warthog.procyon.org.uk/ # v4
---
 include/linux/fscache.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index 18e725671594..28ce258c1f87 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -607,4 +607,20 @@ static inline void fscache_clear_inode_writeback(struct fscache_cookie *cookie,
 	}
 }
 
+/**
+ * fscache_note_page_release - Note that a netfs page got released
+ * @cookie: The cookie corresponding to the file
+ *
+ * Note that a page that has been copied to the cache has been released.  This
+ * means that future reads will need to look in the cache to see if it's there.
+ */
+static inline
+void fscache_note_page_release(struct fscache_cookie *cookie)
+{
+	if (cookie &&
+	    test_bit(FSCACHE_COOKIE_HAVE_DATA, &cookie->flags) &&
+	    test_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags))
+		clear_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags);
+}
+
 #endif /* _LINUX_FSCACHE_H */
-- 
cgit v1.2.3


From 16a96bdf92d5af06f9fa6a01a4b08e2fdfed2e5b Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 20 Oct 2021 14:06:34 +0100
Subject: fscache: Provide a function to resize a cookie

Provide a function to change the size of the storage attached to a cookie,
to match the size of the file being cached when it's changed by truncate or
fallocate:

	void fscache_resize_cookie(struct fscache_cookie *cookie,
				   loff_t new_size);

This acts synchronously and is expected to run under the inode lock of the
caller.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
Link: https://lore.kernel.org/r/163819621839.215744.7895597119803515402.stgit@warthog.procyon.org.uk/ # v1
Link: https://lore.kernel.org/r/163906922387.143852.16394459879816147793.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/163967128998.1823006.10740669081985775576.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/164021527861.640689.3466382085497236267.stgit@warthog.procyon.org.uk/ # v4
---
 include/linux/fscache-cache.h |  4 ++++
 include/linux/fscache.h       | 18 ++++++++++++++++++
 2 files changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index 796c8b5c5305..3fa4902dc87c 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -64,6 +64,10 @@ struct fscache_cache_ops {
 	/* Withdraw an object without any cookie access counts held */
 	void (*withdraw_cookie)(struct fscache_cookie *cookie);
 
+	/* Change the size of a data object */
+	void (*resize_cookie)(struct netfs_cache_resources *cres,
+			      loff_t new_size);
+
 	/* Invalidate an object */
 	bool (*invalidate_cookie)(struct fscache_cookie *cookie);
 
diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index 28ce258c1f87..86b1c0db1de5 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -163,6 +163,7 @@ extern struct fscache_cookie *__fscache_acquire_cookie(
 extern void __fscache_use_cookie(struct fscache_cookie *, bool);
 extern void __fscache_unuse_cookie(struct fscache_cookie *, const void *, const loff_t *);
 extern void __fscache_relinquish_cookie(struct fscache_cookie *, bool);
+extern void __fscache_resize_cookie(struct fscache_cookie *, loff_t);
 extern void __fscache_invalidate(struct fscache_cookie *, const void *, loff_t, unsigned int);
 extern int __fscache_begin_read_operation(struct netfs_cache_resources *, struct fscache_cookie *);
 
@@ -366,6 +367,23 @@ void fscache_update_cookie(struct fscache_cookie *cookie, const void *aux_data,
 		__fscache_update_cookie(cookie, aux_data, object_size);
 }
 
+/**
+ * fscache_resize_cookie - Request that a cache object be resized
+ * @cookie: The cookie representing the cache object
+ * @new_size: The new size of the object (may be NULL)
+ *
+ * Request that the size of an object be changed.
+ *
+ * See Documentation/filesystems/caching/netfs-api.txt for a complete
+ * description.
+ */
+static inline
+void fscache_resize_cookie(struct fscache_cookie *cookie, loff_t new_size)
+{
+	if (fscache_cookie_enabled(cookie))
+		__fscache_resize_cookie(cookie, new_size);
+}
+
 /**
  * fscache_invalidate - Notify cache that an object needs invalidation
  * @cookie: The cookie representing the cache object
-- 
cgit v1.2.3


From 1bd9c4e4f0494915b2391f373d25096579f835ff Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 18 Nov 2021 08:58:08 +0000
Subject: vfs, cachefiles: Mark a backing file in use with an inode flag

Use an inode flag, S_KERNEL_FILE, to mark that a backing file is in use by
the kernel to prevent cachefiles or other kernel services from interfering
with that file.

Alter rmdir to reject attempts to remove a directory marked with this flag.
This is used by cachefiles to prevent cachefilesd from removing them.

Using S_SWAPFILE instead isn't really viable as that has other effects in
the I/O paths.

Changes
=======
ver #3:
 - Check for the object pointer being NULL in the tracepoints rather than
   the caller.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
Link: https://lore.kernel.org/r/163819630256.215744.4815885535039369574.stgit@warthog.procyon.org.uk/ # v1
Link: https://lore.kernel.org/r/163906931596.143852.8642051223094013028.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/163967141000.1823006.12920680657559677789.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/164021541207.640689.564689725898537127.stgit@warthog.procyon.org.uk/ # v4
---
 include/linux/fs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2c0b8e77d9ab..bcf1ca430139 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2249,6 +2249,7 @@ struct super_operations {
 #define S_ENCRYPTED	(1 << 14) /* Encrypted file (using fs/crypto/) */
 #define S_CASEFOLD	(1 << 15) /* Casefolded file */
 #define S_VERITY	(1 << 16) /* Verity file (using fs/verity/) */
+#define S_KERNEL_FILE	(1 << 17) /* File is in use by the kernel (eg. fs/cachefiles) */
 
 /*
  * Note that nosuid etc flags are inode-specific: setting some file-system
-- 
cgit v1.2.3


From 32e150037dce368d129996ffe5f98217b1974d9e Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 14 Dec 2021 09:51:43 +0000
Subject: fscache, cachefiles: Store the volume coherency data

Store the volume coherency data in an xattr and check it when we rebind the
volume.  If it doesn't match the cache volume is moved to the graveyard and
rebuilt anew.

Changes
=======
ver #4:
 - Remove a couple of debugging prints.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Link: https://lore.kernel.org/r/163967164397.1823006.2950539849831291830.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/164021563138.640689.15851092065380543119.stgit@warthog.procyon.org.uk/ # v4
---
 include/linux/fscache.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index 86b1c0db1de5..7bd35f60d19a 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -87,6 +87,8 @@ struct fscache_volume {
 #define FSCACHE_VOLUME_COLLIDED_WITH	2	/* Volume was collided with */
 #define FSCACHE_VOLUME_ACQUIRE_PENDING	3	/* Volume is waiting to complete acquisition */
 #define FSCACHE_VOLUME_CREATING		4	/* Volume is being created on disk */
+	u8				coherency_len;	/* Length of the coherency data */
+	u8				coherency[];	/* Coherency data */
 };
 
 /*
-- 
cgit v1.2.3


From 3929eca769b5a231010b4978acc61c0735da198f Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 21 Oct 2021 21:58:29 +0100
Subject: fscache, cachefiles: Display stats of no-space events

Add stat counters of no-space events that caused caching not to happen and
display in /proc/fs/fscache/stats.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
Link: https://lore.kernel.org/r/163819653216.215744.17210522251617386509.stgit@warthog.procyon.org.uk/ # v1
Link: https://lore.kernel.org/r/163906958369.143852.7257100711818401748.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/163967166917.1823006.14842444049198947892.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/164021566184.640689.4417328329632709265.stgit@warthog.procyon.org.uk/ # v4
---
 include/linux/fscache-cache.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index 3fa4902dc87c..007e47f38610 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -186,11 +186,17 @@ static inline void fscache_wait_for_objects(struct fscache_cache *cache)
 #ifdef CONFIG_FSCACHE_STATS
 extern atomic_t fscache_n_read;
 extern atomic_t fscache_n_write;
+extern atomic_t fscache_n_no_write_space;
+extern atomic_t fscache_n_no_create_space;
 #define fscache_count_read() atomic_inc(&fscache_n_read)
 #define fscache_count_write() atomic_inc(&fscache_n_write)
+#define fscache_count_no_write_space() atomic_inc(&fscache_n_no_write_space)
+#define fscache_count_no_create_space() atomic_inc(&fscache_n_no_create_space)
 #else
 #define fscache_count_read() do {} while(0)
 #define fscache_count_write() do {} while(0)
+#define fscache_count_no_write_space() do {} while(0)
+#define fscache_count_no_create_space() do {} while(0)
 #endif
 
 #endif /* _LINUX_FSCACHE_CACHE_H */
-- 
cgit v1.2.3


From 9f08ebc3438baaaefcc79654b330209b83397f17 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 22 Oct 2021 09:17:58 +0100
Subject: fscache, cachefiles: Display stat of culling events

Add a stat counter of culling events whereby the cache backend culls a file
to make space (when asked by cachefilesd in this case) and display in
/proc/fs/fscache/stats.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
Link: https://lore.kernel.org/r/163819654165.215744.3797804661644212436.stgit@warthog.procyon.org.uk/ # v1
Link: https://lore.kernel.org/r/163906961387.143852.9291157239960289090.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/163967168266.1823006.14436200166581605746.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/164021567619.640689.4339228906248763197.stgit@warthog.procyon.org.uk/ # v4
---
 include/linux/fscache-cache.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index 007e47f38610..a174cedf4d90 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -188,15 +188,18 @@ extern atomic_t fscache_n_read;
 extern atomic_t fscache_n_write;
 extern atomic_t fscache_n_no_write_space;
 extern atomic_t fscache_n_no_create_space;
+extern atomic_t fscache_n_culled;
 #define fscache_count_read() atomic_inc(&fscache_n_read)
 #define fscache_count_write() atomic_inc(&fscache_n_write)
 #define fscache_count_no_write_space() atomic_inc(&fscache_n_no_write_space)
 #define fscache_count_no_create_space() atomic_inc(&fscache_n_no_create_space)
+#define fscache_count_culled() atomic_inc(&fscache_n_culled)
 #else
 #define fscache_count_read() do {} while(0)
 #define fscache_count_write() do {} while(0)
 #define fscache_count_no_write_space() do {} while(0)
 #define fscache_count_no_create_space() do {} while(0)
+#define fscache_count_culled() do {} while(0)
 #endif
 
 #endif /* _LINUX_FSCACHE_CACHE_H */
-- 
cgit v1.2.3


From 2efd61a608b0039911924d2e5d7028eb37496e85 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Fri, 10 Dec 2021 16:36:20 +0000
Subject: KVM: Warn if mark_page_dirty() is called without an active vCPU

The various kvm_write_guest() and mark_page_dirty() functions must only
ever be called in the context of an active vCPU, because if dirty ring
tracking is enabled it may simply oops when kvm_get_running_vcpu()
returns NULL for the vcpu and then kvm_dirty_ring_get() dereferences it.

This oops was reported by "butt3rflyh4ck" <butterflyhuangxx@gmail.com> in
https://lore.kernel.org/kvm/CAFcO6XOmoS7EacN_n6v4Txk7xL7iqRa2gABg3F7E3Naf5uG94g@mail.gmail.com/

That actual bug will be fixed under separate cover but this warning
should help to prevent new ones from being added.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Message-Id: <20211210163625.2886-2-dwmw2@infradead.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_dirty_ring.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_dirty_ring.h b/include/linux/kvm_dirty_ring.h
index 4da8d4a4140b..906f899813dc 100644
--- a/include/linux/kvm_dirty_ring.h
+++ b/include/linux/kvm_dirty_ring.h
@@ -43,11 +43,6 @@ static inline int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring,
 	return 0;
 }
 
-static inline struct kvm_dirty_ring *kvm_dirty_ring_get(struct kvm *kvm)
-{
-	return NULL;
-}
-
 static inline int kvm_dirty_ring_reset(struct kvm *kvm,
 				       struct kvm_dirty_ring *ring)
 {
@@ -78,7 +73,6 @@ static inline bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring)
 
 u32 kvm_dirty_ring_get_rsvd_entries(void);
 int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size);
-struct kvm_dirty_ring *kvm_dirty_ring_get(struct kvm *kvm);
 
 /*
  * called with kvm->slots_lock held, returns the number of
-- 
cgit v1.2.3


From 982ed0de4753ed6e71dbd40f82a5a066baf133ed Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Fri, 10 Dec 2021 16:36:21 +0000
Subject: KVM: Reinstate gfn_to_pfn_cache with invalidation support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This can be used in two modes. There is an atomic mode where the cached
mapping is accessed while holding the rwlock, and a mode where the
physical address is used by a vCPU in guest mode.

For the latter case, an invalidation will wake the vCPU with the new
KVM_REQ_GPC_INVALIDATE, and the architecture will need to refresh any
caches it still needs to access before entering guest mode again.

Only one vCPU can be targeted by the wake requests; it's simple enough
to make it wake all vCPUs or even a mask but I don't see a use case for
that additional complexity right now.

Invalidation happens from the invalidate_range_start MMU notifier, which
needs to be able to sleep in order to wake the vCPU and wait for it.

This means that revalidation potentially needs to "wait" for the MMU
operation to complete and the invalidate_range_end notifier to be
invoked. Like the vCPU when it takes a page fault in that period, we
just spin — fixing that in a future patch by implementing an actual
*wait* may be another part of shaving this particularly hirsute yak.

As noted in the comments in the function itself, the only case where
the invalidate_range_start notifier is expected to be called *without*
being able to sleep is when the OOM reaper is killing the process. In
that case, we expect the vCPU threads already to have exited, and thus
there will be nothing to wake, and no reason to wait. So we clear the
KVM_REQUEST_WAIT bit and send the request anyway, then complain loudly
if there actually *was* anything to wake up.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Message-Id: <20211210163625.2886-3-dwmw2@infradead.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h  | 103 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/kvm_types.h |  18 ++++++++
 2 files changed, 121 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index f9bbcf519280..9bbb1f1d9e48 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -155,6 +155,7 @@ static inline bool is_error_page(struct page *page)
 #define KVM_REQ_UNBLOCK           2
 #define KVM_REQ_UNHALT            3
 #define KVM_REQ_VM_DEAD           (4 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_GPC_INVALIDATE    (5 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
 #define KVM_REQUEST_ARCH_BASE     8
 
 #define KVM_ARCH_REQ_FLAGS(nr, flags) ({ \
@@ -593,6 +594,10 @@ struct kvm {
 	unsigned long mn_active_invalidate_count;
 	struct rcuwait mn_memslots_update_rcuwait;
 
+	/* For management / invalidation of gfn_to_pfn_caches */
+	spinlock_t gpc_lock;
+	struct list_head gpc_list;
+
 	/*
 	 * created_vcpus is protected by kvm->lock, and is incremented
 	 * at the beginning of KVM_CREATE_VCPU.  online_vcpus is only
@@ -1099,6 +1104,104 @@ int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
 			 unsigned long len);
 void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn);
 
+/**
+ * kvm_gfn_to_pfn_cache_init - prepare a cached kernel mapping and HPA for a
+ *                             given guest physical address.
+ *
+ * @kvm:	   pointer to kvm instance.
+ * @gpc:	   struct gfn_to_pfn_cache object.
+ * @vcpu:	   vCPU to be used for marking pages dirty and to be woken on
+ *		   invalidation.
+ * @guest_uses_pa: indicates that the resulting host physical PFN is used while
+ *		   @vcpu is IN_GUEST_MODE so invalidations should wake it.
+ * @kernel_map:    requests a kernel virtual mapping (kmap / memremap).
+ * @gpa:	   guest physical address to map.
+ * @len:	   sanity check; the range being access must fit a single page.
+ * @dirty:         mark the cache dirty immediately.
+ *
+ * @return:	   0 for success.
+ *		   -EINVAL for a mapping which would cross a page boundary.
+ *                 -EFAULT for an untranslatable guest physical address.
+ *
+ * This primes a gfn_to_pfn_cache and links it into the @kvm's list for
+ * invalidations to be processed. Invalidation callbacks to @vcpu using
+ * %KVM_REQ_GPC_INVALIDATE will occur only for MMU notifiers, not for KVM
+ * memslot changes. Callers are required to use kvm_gfn_to_pfn_cache_check()
+ * to ensure that the cache is valid before accessing the target page.
+ */
+int kvm_gfn_to_pfn_cache_init(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
+			      struct kvm_vcpu *vcpu, bool guest_uses_pa,
+			      bool kernel_map, gpa_t gpa, unsigned long len,
+			      bool dirty);
+
+/**
+ * kvm_gfn_to_pfn_cache_check - check validity of a gfn_to_pfn_cache.
+ *
+ * @kvm:	   pointer to kvm instance.
+ * @gpc:	   struct gfn_to_pfn_cache object.
+ * @gpa:	   current guest physical address to map.
+ * @len:	   sanity check; the range being access must fit a single page.
+ * @dirty:         mark the cache dirty immediately.
+ *
+ * @return:	   %true if the cache is still valid and the address matches.
+ *		   %false if the cache is not valid.
+ *
+ * Callers outside IN_GUEST_MODE context should hold a read lock on @gpc->lock
+ * while calling this function, and then continue to hold the lock until the
+ * access is complete.
+ *
+ * Callers in IN_GUEST_MODE may do so without locking, although they should
+ * still hold a read lock on kvm->scru for the memslot checks.
+ */
+bool kvm_gfn_to_pfn_cache_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
+				gpa_t gpa, unsigned long len);
+
+/**
+ * kvm_gfn_to_pfn_cache_refresh - update a previously initialized cache.
+ *
+ * @kvm:	   pointer to kvm instance.
+ * @gpc:	   struct gfn_to_pfn_cache object.
+ * @gpa:	   updated guest physical address to map.
+ * @len:	   sanity check; the range being access must fit a single page.
+ * @dirty:         mark the cache dirty immediately.
+ *
+ * @return:	   0 for success.
+ *		   -EINVAL for a mapping which would cross a page boundary.
+ *                 -EFAULT for an untranslatable guest physical address.
+ *
+ * This will attempt to refresh a gfn_to_pfn_cache. Note that a successful
+ * returm from this function does not mean the page can be immediately
+ * accessed because it may have raced with an invalidation. Callers must
+ * still lock and check the cache status, as this function does not return
+ * with the lock still held to permit access.
+ */
+int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
+				 gpa_t gpa, unsigned long len, bool dirty);
+
+/**
+ * kvm_gfn_to_pfn_cache_unmap - temporarily unmap a gfn_to_pfn_cache.
+ *
+ * @kvm:	   pointer to kvm instance.
+ * @gpc:	   struct gfn_to_pfn_cache object.
+ *
+ * This unmaps the referenced page and marks it dirty, if appropriate. The
+ * cache is left in the invalid state but at least the mapping from GPA to
+ * userspace HVA will remain cached and can be reused on a subsequent
+ * refresh.
+ */
+void kvm_gfn_to_pfn_cache_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc);
+
+/**
+ * kvm_gfn_to_pfn_cache_destroy - destroy and unlink a gfn_to_pfn_cache.
+ *
+ * @kvm:	   pointer to kvm instance.
+ * @gpc:	   struct gfn_to_pfn_cache object.
+ *
+ * This removes a cache from the @kvm's list to be processed on MMU notifier
+ * invocation.
+ */
+void kvm_gfn_to_pfn_cache_destroy(struct kvm *kvm, struct gfn_to_pfn_cache *gpc);
+
 void kvm_sigset_activate(struct kvm_vcpu *vcpu);
 void kvm_sigset_deactivate(struct kvm_vcpu *vcpu);
 
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 888ef12862c9..dceac12c1ce5 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -19,6 +19,7 @@ struct kvm_memslots;
 enum kvm_mr_change;
 
 #include <linux/types.h>
+#include <linux/spinlock_types.h>
 
 #include <asm/kvm_types.h>
 
@@ -53,6 +54,23 @@ struct gfn_to_hva_cache {
 	struct kvm_memory_slot *memslot;
 };
 
+struct gfn_to_pfn_cache {
+	u64 generation;
+	gpa_t gpa;
+	unsigned long uhva;
+	struct kvm_memory_slot *memslot;
+	struct kvm_vcpu *vcpu;
+	struct list_head list;
+	rwlock_t lock;
+	void *khva;
+	kvm_pfn_t pfn;
+	bool active;
+	bool valid;
+	bool dirty;
+	bool kernel_map;
+	bool guest_uses_pa;
+};
+
 #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
 /*
  * Memory caches are used to preallocate memory ahead of various MMU flows,
-- 
cgit v1.2.3


From 14243b387137a4afbe1df5d9dc15182d6657bb79 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Fri, 10 Dec 2021 16:36:23 +0000
Subject: KVM: x86/xen: Add KVM_IRQ_ROUTING_XEN_EVTCHN and event channel
 delivery

This adds basic support for delivering 2 level event channels to a guest.

Initially, it only supports delivery via the IRQ routing table, triggered
by an eventfd. In order to do so, it has a kvm_xen_set_evtchn_fast()
function which will use the pre-mapped shared_info page if it already
exists and is still valid, while the slow path through the irqfd_inject
workqueue will remap the shared_info page if necessary.

It sets the bits in the shared_info page but not the vcpu_info; that is
deferred to __kvm_xen_has_interrupt() which raises the vector to the
appropriate vCPU.

Add a 'verbose' mode to xen_shinfo_test while adding test cases for this.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Message-Id: <20211210163625.2886-5-dwmw2@infradead.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 9bbb1f1d9e48..3c47b146851a 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -497,6 +497,12 @@ struct kvm_hv_sint {
 	u32 sint;
 };
 
+struct kvm_xen_evtchn {
+	u32 port;
+	u32 vcpu;
+	u32 priority;
+};
+
 struct kvm_kernel_irq_routing_entry {
 	u32 gsi;
 	u32 type;
@@ -517,6 +523,7 @@ struct kvm_kernel_irq_routing_entry {
 		} msi;
 		struct kvm_s390_adapter_int adapter;
 		struct kvm_hv_sint hv_sint;
+		struct kvm_xen_evtchn xen_evtchn;
 	};
 	struct hlist_node link;
 };
-- 
cgit v1.2.3


From 44ea62813f0ab3d718de480504f4dfd0bdd01858 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 3 Sep 2021 18:31:40 -0700
Subject: spi: don't include ptp_clock_kernel.h in spi.h

Commit b42faeee718c ("spi: Add a PTP system timestamp
to the transfer structure") added an include of ptp_clock_kernel.h
to spi.h for struct ptp_system_timestamp but a forward declaration
is enough. Let's use that to limit the number of objects we have
to rebuild every time we touch networking headers.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Tested-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://lore.kernel.org/r/20210904013140.2377609-1-kuba@kernel.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/spi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index eb7ac8a1e03c..7ab3fed7b804 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -14,12 +14,12 @@
 #include <linux/completion.h>
 #include <linux/scatterlist.h>
 #include <linux/gpio/consumer.h>
-#include <linux/ptp_clock_kernel.h>
 
 #include <uapi/linux/spi/spi.h>
 
 struct dma_chan;
 struct software_node;
+struct ptp_system_timestamp;
 struct spi_controller;
 struct spi_transfer;
 struct spi_controller_mem_ops;
-- 
cgit v1.2.3


From 3506659e18a61ae525f3b9b4f5af23b4b149d4db Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Sun, 28 Nov 2021 14:53:35 -0500
Subject: mm: Add unmap_mapping_folio()

Convert both callers of unmap_mapping_page() to call unmap_mapping_folio()
instead.  Also move zap_details from linux/mm.h to mm/memory.c

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
---
 include/linux/mm.h | 24 ------------------------
 1 file changed, 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 145f045b0ddc..c9cdb26802fb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1825,28 +1825,6 @@ static inline bool can_do_mlock(void) { return false; }
 extern int user_shm_lock(size_t, struct ucounts *);
 extern void user_shm_unlock(size_t, struct ucounts *);
 
-/*
- * Parameter block passed down to zap_pte_range in exceptional cases.
- */
-struct zap_details {
-	struct address_space *zap_mapping;	/* Check page->mapping if set */
-	struct page *single_page;		/* Locked page to be unmapped */
-};
-
-/*
- * We set details->zap_mappings when we want to unmap shared but keep private
- * pages. Return true if skip zapping this page, false otherwise.
- */
-static inline bool
-zap_skip_check_mapping(struct zap_details *details, struct page *page)
-{
-	if (!details || !page)
-		return false;
-
-	return details->zap_mapping &&
-	    (details->zap_mapping != page_rmapping(page));
-}
-
 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 			     pte_t pte);
 struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -1892,7 +1870,6 @@ extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
 extern int fixup_user_fault(struct mm_struct *mm,
 			    unsigned long address, unsigned int fault_flags,
 			    bool *unlocked);
-void unmap_mapping_page(struct page *page);
 void unmap_mapping_pages(struct address_space *mapping,
 		pgoff_t start, pgoff_t nr, bool even_cows);
 void unmap_mapping_range(struct address_space *mapping,
@@ -1913,7 +1890,6 @@ static inline int fixup_user_fault(struct mm_struct *mm, unsigned long address,
 	BUG();
 	return -EFAULT;
 }
-static inline void unmap_mapping_page(struct page *page) { }
 static inline void unmap_mapping_pages(struct address_space *mapping,
 		pgoff_t start, pgoff_t nr, bool even_cows) { }
 static inline void unmap_mapping_range(struct address_space *mapping,
-- 
cgit v1.2.3


From 1e84a3d997b74c33491899e31d48774f252213ab Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 2 Dec 2021 16:01:55 -0500
Subject: truncate,shmem: Add truncate_inode_folio()

Convert all callers of truncate_inode_page() to call
truncate_inode_folio() instead, and move the declaration to mm/internal.h.
Move the assertion that the caller is not passing in a tail page to
generic_error_remove_page().  We can't entirely remove the struct page
from the callers yet because the page pointer in the pvec might be a
shadow/dax/swap entry instead of actually a page.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
---
 include/linux/mm.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c9cdb26802fb..d8b7d7ed14dd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1859,7 +1859,6 @@ extern void truncate_pagecache(struct inode *inode, loff_t new);
 extern void truncate_setsize(struct inode *inode, loff_t newsize);
 void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to);
 void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end);
-int truncate_inode_page(struct address_space *mapping, struct page *page);
 int generic_error_remove_page(struct address_space *mapping, struct page *page);
 int invalidate_inode_page(struct page *page);
 
-- 
cgit v1.2.3


From 0e499ed3d7a216706e02eeded562627d3e69dcfd Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 1 Sep 2020 23:17:50 -0400
Subject: filemap: Return only folios from find_get_entries()

The callers have all been converted to work on folios, so convert
find_get_entries() to return a batch of folios instead of pages.
We also now return multiple large folios in a single call.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/pagemap.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index eb6e58e106c8..d2259a1da51c 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -592,8 +592,6 @@ static inline struct page *find_subpage(struct page *head, pgoff_t index)
 	return head + (index & (thp_nr_pages(head) - 1));
 }
 
-unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
-		pgoff_t end, struct pagevec *pvec, pgoff_t *indices);
 unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
 			pgoff_t end, unsigned int nr_pages,
 			struct page **pages);
-- 
cgit v1.2.3


From 51dcbdac28d4dde915f78adf08bb3fac87f516e9 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 7 Dec 2021 14:15:07 -0500
Subject: mm: Convert find_lock_entries() to use a folio_batch

find_lock_entries() already only returned the head page of folios, so
convert it to return a folio_batch instead of a pagevec.  That cascades
through converting truncate_inode_pages_range() to
delete_from_page_cache_batch() and page_cache_delete_batch().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
---
 include/linux/pagemap.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index d2259a1da51c..6e038811f4c8 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -16,7 +16,7 @@
 #include <linux/hardirq.h> /* for in_interrupt() */
 #include <linux/hugetlb_inline.h>
 
-struct pagevec;
+struct folio_batch;
 
 static inline bool mapping_empty(struct address_space *mapping)
 {
@@ -936,7 +936,7 @@ static inline void __delete_from_page_cache(struct page *page, void *shadow)
 }
 void replace_page_cache_page(struct page *old, struct page *new);
 void delete_from_page_cache_batch(struct address_space *mapping,
-				  struct pagevec *pvec);
+				  struct folio_batch *fbatch);
 int try_to_release_page(struct page *page, gfp_t gfp);
 bool filemap_release_folio(struct folio *folio, gfp_t gfp);
 loff_t mapping_seek_hole_data(struct address_space *, loff_t start, loff_t end,
-- 
cgit v1.2.3


From 1613fac9aaf840af76faa747ea428a714af98dbd Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 7 Dec 2021 14:28:49 -0500
Subject: mm: Remove pagevec_remove_exceptionals()

All of its callers now call folio_batch_remove_exceptionals().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
---
 include/linux/pagevec.h | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index c3fa616d7ae7..dda8d5868c81 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -27,7 +27,6 @@ struct pagevec {
 
 void __pagevec_release(struct pagevec *pvec);
 void __pagevec_lru_add(struct pagevec *pvec);
-void pagevec_remove_exceptionals(struct pagevec *pvec);
 unsigned pagevec_lookup_range(struct pagevec *pvec,
 			      struct address_space *mapping,
 			      pgoff_t *start, pgoff_t end);
@@ -146,8 +145,5 @@ static inline void folio_batch_release(struct folio_batch *fbatch)
 	pagevec_release((struct pagevec *)fbatch);
 }
 
-static inline void folio_batch_remove_exceptionals(struct folio_batch *fbatch)
-{
-	pagevec_remove_exceptionals((struct pagevec *)fbatch);
-}
+void folio_batch_remove_exceptionals(struct folio_batch *fbatch);
 #endif /* _LINUX_PAGEVEC_H */
-- 
cgit v1.2.3


From 25a8de7f8d970ffa7263bd9d32a08138cd949f17 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 27 Aug 2021 07:21:49 -0400
Subject: XArray: Add xas_advance()

Add a new helper function to help iterate over multi-index entries.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
---
 include/linux/xarray.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index a91e3d90df8a..d6d5da6ed735 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -1580,6 +1580,24 @@ static inline void xas_set(struct xa_state *xas, unsigned long index)
 	xas->xa_node = XAS_RESTART;
 }
 
+/**
+ * xas_advance() - Skip over sibling entries.
+ * @xas: XArray operation state.
+ * @index: Index of last sibling entry.
+ *
+ * Move the operation state to refer to the last sibling entry.
+ * This is useful for loops that normally want to see sibling
+ * entries but sometimes want to skip them.  Use xas_set() if you
+ * want to move to an index which is not part of this entry.
+ */
+static inline void xas_advance(struct xa_state *xas, unsigned long index)
+{
+	unsigned char shift = xas_is_node(xas) ? xas->xa_node->shift : 0;
+
+	xas->xa_index = index;
+	xas->xa_offset = (index >> shift) & XA_CHUNK_MASK;
+}
+
 /**
  * xas_set_order() - Set up XArray operation state for a multislot entry.
  * @xas: XArray operation state.
-- 
cgit v1.2.3


From 6b24ca4a1a8d4ee3221d6d44ddbb99f542e4bda3 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Sat, 27 Jun 2020 22:19:08 -0400
Subject: mm: Use multi-index entries in the page cache

We currently store large folios as 2^N consecutive entries.  While this
consumes rather more memory than necessary, it also turns out to be buggy.
A writeback operation which starts within a tail page of a dirty folio will
not write back the folio as the xarray's dirty bit is only set on the
head index.  With multi-index entries, the dirty bit will be found no
matter where in the folio the operation starts.

This does end up simplifying the page cache slightly, although not as
much as I had hoped.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
---
 include/linux/pagemap.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 6e038811f4c8..704cb1b4b15d 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -1125,16 +1125,6 @@ static inline unsigned int __readahead_batch(struct readahead_control *rac,
 		VM_BUG_ON_PAGE(PageTail(page), page);
 		array[i++] = page;
 		rac->_batch_count += thp_nr_pages(page);
-
-		/*
-		 * The page cache isn't using multi-index entries yet,
-		 * so the xas cursor needs to be manually moved to the
-		 * next index.  This can be removed once the page cache
-		 * is converted.
-		 */
-		if (PageHead(page))
-			xas_set(&xas, rac->_index + rac->_batch_count);
-
 		if (i == array_sz)
 			break;
 	}
-- 
cgit v1.2.3


From e32cf5dfbe227b355776948b2c9b5691b84d1cbd Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 22 Dec 2021 22:10:09 -0600
Subject: kthread: Generalize pf_io_worker so it can point to struct kthread

The point of using set_child_tid to hold the kthread pointer was that
it already did what is necessary.  There are now restrictions on when
set_child_tid can be initialized and when set_child_tid can be used in
schedule_tail.  Which indicates that continuing to use set_child_tid
to hold the kthread pointer is a bad idea.

Instead of continuing to use the set_child_tid field of task_struct
generalize the pf_io_worker field of task_struct and use it to hold
the kthread pointer.

Rename pf_io_worker (which is a void * pointer) to worker_private so
it can be used to store kthreads struct kthread pointer.  Update the
kthread code to store the kthread pointer in the worker_private field.
Remove the places where set_child_tid had to be dealt with carefully
because kthreads also used it.

Link: https://lkml.kernel.org/r/CAHk-=wgtFAA9SbVYg0gR1tqPMC17-NYcs0GQkaYg1bGhh1uJQQ@mail.gmail.com
Link: https://lkml.kernel.org/r/87a6grvqy8.fsf_-_@email.froward.int.ebiederm.org
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/sched.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 78c351e35fec..52f2fdffa3ab 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -987,8 +987,8 @@ struct task_struct {
 	/* CLONE_CHILD_CLEARTID: */
 	int __user			*clear_child_tid;
 
-	/* PF_IO_WORKER */
-	void				*pf_io_worker;
+	/* PF_KTHREAD | PF_IO_WORKER */
+	void				*worker_private;
 
 	u64				utime;
 	u64				stime;
-- 
cgit v1.2.3


From 3367d1bd738c01b2737eaab7d922bfe5f1a41f38 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= <linux@weissschuh.net>
Date: Sat, 8 Jan 2022 16:31:58 +0100
Subject: power: supply: Provide stubs for charge_behaviour helpers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When CONFIG_SYSFS is not enabled provide stubs for the helper functions
to not break their callers.

Fixes: 539b9c94ac83 ("power: supply: add helpers for charge_behaviour sysfs")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Link: https://lore.kernel.org/r/20220108153158.189489-1-linux@weissschuh.net
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/power_supply.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index 71f0379c2af8..f6b9ed4630fa 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -553,6 +553,21 @@ ssize_t power_supply_charge_behaviour_show(struct device *dev,
 					   char *buf);
 
 int power_supply_charge_behaviour_parse(unsigned int available_behaviours, const char *buf);
+#else
+static inline
+ssize_t power_supply_charge_behaviour_show(struct device *dev,
+					   unsigned int available_behaviours,
+					   enum power_supply_charge_behaviour behaviour,
+					   char *buf)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int power_supply_charge_behaviour_parse(unsigned int available_behaviours,
+						      const char *buf)
+{
+	return -EOPNOTSUPP;
+}
 #endif
 
 #endif /* __LINUX_POWER_SUPPLY_H__ */
-- 
cgit v1.2.3


From 2f824d4d197e02275562359a2ae5274177ce500c Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sat, 8 Jan 2022 09:48:31 -0600
Subject: signal: Remove SIGNAL_GROUP_COREDUMP

After the previous cleanups "signal->core_state" is set whenever
SIGNAL_GROUP_COREDUMP is set and "signal->core_state" is tested
whenver the code wants to know if a coredump is in progress.  The
remaining tests of SIGNAL_GROUP_COREDUMP also test to see if
SIGNAL_GROUP_EXIT is set.  Similarly the only place that sets
SIGNAL_GROUP_COREDUMP also sets SIGNAL_GROUP_EXIT.

Which makes SIGNAL_GROUP_COREDUMP unecessary and redundant. So stop
setting SIGNAL_GROUP_COREDUMP, stop testing SIGNAL_GROUP_COREDUMP, and
remove it's definition.

With the setting of SIGNAL_GROUP_COREDUMP gone, coredump_finish no
longer needs to clear SIGNAL_GROUP_COREDUMP out of signal->flags
by setting SIGNAL_GROUP_EXIT.

Link: https://lkml.kernel.org/r/20211213225350.27481-5-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/sched/signal.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index fa26d2a58413..ecc10e148799 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -256,7 +256,6 @@ struct signal_struct {
 #define SIGNAL_STOP_STOPPED	0x00000001 /* job control stop in effect */
 #define SIGNAL_STOP_CONTINUED	0x00000002 /* SIGCONT since WCONTINUED reap */
 #define SIGNAL_GROUP_EXIT	0x00000004 /* group exit in progress */
-#define SIGNAL_GROUP_COREDUMP	0x00000008 /* coredump in progress */
 /*
  * Pending notifications to parent.
  */
@@ -272,7 +271,7 @@ struct signal_struct {
 static inline void signal_set_stop_flags(struct signal_struct *sig,
 					 unsigned int flags)
 {
-	WARN_ON(sig->flags & (SIGNAL_GROUP_EXIT|SIGNAL_GROUP_COREDUMP));
+	WARN_ON(sig->flags & SIGNAL_GROUP_EXIT);
 	sig->flags = (sig->flags & ~SIGNAL_STOP_MASK) | flags;
 }
 
-- 
cgit v1.2.3


From 60700e38fb68e800607ca7a027060d5419fc5798 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sun, 6 Jun 2021 13:47:53 -0500
Subject: signal: Rename group_exit_task group_exec_task

The only remaining user of group_exit_task is exec.  Rename the field
so that it is clear which part of the code uses it.

Update the comment above the definition of group_exec_task to document
how it is currently used.

Link: https://lkml.kernel.org/r/20211213225350.27481-7-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/sched/signal.h | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index ecc10e148799..d3248aba5183 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -109,13 +109,9 @@ struct signal_struct {
 
 	/* thread group exit support */
 	int			group_exit_code;
-	/* overloaded:
-	 * - notify group_exit_task when ->count is equal to notify_count
-	 * - everyone except group_exit_task is stopped during signal delivery
-	 *   of fatal signals, group_exit_task processes the signal.
-	 */
+	/* notify group_exec_task when notify_count is less or equal to 0 */
 	int			notify_count;
-	struct task_struct	*group_exit_task;
+	struct task_struct	*group_exec_task;
 
 	/* thread group stop support, overloads group_exit_code too */
 	int			group_stop_count;
@@ -275,11 +271,11 @@ static inline void signal_set_stop_flags(struct signal_struct *sig,
 	sig->flags = (sig->flags & ~SIGNAL_STOP_MASK) | flags;
 }
 
-/* If true, all threads except ->group_exit_task have pending SIGKILL */
+/* If true, all threads except ->group_exec_task have pending SIGKILL */
 static inline int signal_group_exit(const struct signal_struct *sig)
 {
 	return	(sig->flags & SIGNAL_GROUP_EXIT) ||
-		(sig->group_exit_task != NULL);
+		(sig->group_exec_task != NULL);
 }
 
 extern void flush_signals(struct task_struct *);
-- 
cgit v1.2.3


From 49697335e0b441b0553598c1b48ee9ebb053d2f1 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 24 Jun 2021 02:14:30 -0500
Subject: signal: Remove the helper signal_group_exit

This helper is misleading.  It tests for an ongoing exec as well as
the process having received a fatal signal.

Sometimes it is appropriate to treat an on-going exec differently than
a process that is shutting down due to a fatal signal.  In particular
taking the fast path out of exit_signals instead of retargeting
signals is not appropriate during exec, and not changing the the exit
code in do_group_exit during exec.

Removing the helper makes it more obvious what is going on as both
cases must be coded for explicitly.

While removing the helper fix the two cases where I have observed
using signal_group_exit resulted in the wrong result.

In exit_signals only test for SIGNAL_GROUP_EXIT so that signals are
retargetted during an exec.

In do_group_exit use 0 as the exit code during an exec as de_thread
does not set group_exit_code.  As best as I can determine
group_exit_code has been is set to 0 most of the time during
de_thread.  During a thread group stop group_exit_code is set to the
stop signal and when the thread group receives SIGCONT group_exit_code
is reset to 0.

Link: https://lkml.kernel.org/r/20211213225350.27481-8-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/sched/signal.h | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index d3248aba5183..b6ecb9fc4cd2 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -271,13 +271,6 @@ static inline void signal_set_stop_flags(struct signal_struct *sig,
 	sig->flags = (sig->flags & ~SIGNAL_STOP_MASK) | flags;
 }
 
-/* If true, all threads except ->group_exec_task have pending SIGKILL */
-static inline int signal_group_exit(const struct signal_struct *sig)
-{
-	return	(sig->flags & SIGNAL_GROUP_EXIT) ||
-		(sig->group_exec_task != NULL);
-}
-
 extern void flush_signals(struct task_struct *);
 extern void ignore_signals(struct task_struct *);
 extern void flush_signal_handlers(struct task_struct *, int force_default);
-- 
cgit v1.2.3


From 2d4bcf886e42f0f4846a3d9bdc3a90d278903a2e Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sat, 8 Jan 2022 11:23:02 -0600
Subject: exit: Remove profile_task_exit & profile_munmap

When I say remove I mean remove.  All profile_task_exit and
profile_munmap do is call a blocking notifier chain.  The helpers
profile_task_register and profile_task_unregister are not called
anywhere in the tree.  Which means this is all dead code.

So remove the dead code and make it easier to read do_exit.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lkml.kernel.org/r/20220103213312.9144-1-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/profile.h | 26 --------------------------
 1 file changed, 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/profile.h b/include/linux/profile.h
index fd18ca96f557..f7eb2b57d890 100644
--- a/include/linux/profile.h
+++ b/include/linux/profile.h
@@ -31,11 +31,6 @@ static inline int create_proc_profile(void)
 }
 #endif
 
-enum profile_type {
-	PROFILE_TASK_EXIT,
-	PROFILE_MUNMAP
-};
-
 #ifdef CONFIG_PROFILING
 
 extern int prof_on __read_mostly;
@@ -66,23 +61,14 @@ static inline void profile_hit(int type, void *ip)
 struct task_struct;
 struct mm_struct;
 
-/* task is in do_exit() */
-void profile_task_exit(struct task_struct * task);
-
 /* task is dead, free task struct ? Returns 1 if
  * the task was taken, 0 if the task should be freed.
  */
 int profile_handoff_task(struct task_struct * task);
 
-/* sys_munmap */
-void profile_munmap(unsigned long addr);
-
 int task_handoff_register(struct notifier_block * n);
 int task_handoff_unregister(struct notifier_block * n);
 
-int profile_event_register(enum profile_type, struct notifier_block * n);
-int profile_event_unregister(enum profile_type, struct notifier_block * n);
-
 #else
 
 #define prof_on 0
@@ -117,19 +103,7 @@ static inline int task_handoff_unregister(struct notifier_block * n)
 	return -ENOSYS;
 }
 
-static inline int profile_event_register(enum profile_type t, struct notifier_block * n)
-{
-	return -ENOSYS;
-}
-
-static inline int profile_event_unregister(enum profile_type t, struct notifier_block * n)
-{
-	return -ENOSYS;
-}
-
-#define profile_task_exit(a) do { } while (0)
 #define profile_handoff_task(a) (0)
-#define profile_munmap(a) do { } while (0)
 
 #endif /* CONFIG_PROFILING */
 
-- 
cgit v1.2.3


From 2873cd31a20c25b5e763b35e5fb886f0938c6dd5 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sat, 8 Jan 2022 10:03:24 -0600
Subject: exit: Remove profile_handoff_task

All profile_handoff_task does is notify the task_free_notifier chain.
The helpers task_handoff_register and task_handoff_unregister are used
to add and delete entries from that chain and are never called.

So remove the dead code and make it much easier to read and reason
about __put_task_struct.

Suggested-by: Al Viro <viro@zeniv.linux.org.uk>
Link: https://lkml.kernel.org/r/87fspyw6m0.fsf@email.froward.int.ebiederm.org
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/profile.h | 19 -------------------
 1 file changed, 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/profile.h b/include/linux/profile.h
index f7eb2b57d890..11db1ec516e2 100644
--- a/include/linux/profile.h
+++ b/include/linux/profile.h
@@ -61,14 +61,6 @@ static inline void profile_hit(int type, void *ip)
 struct task_struct;
 struct mm_struct;
 
-/* task is dead, free task struct ? Returns 1 if
- * the task was taken, 0 if the task should be freed.
- */
-int profile_handoff_task(struct task_struct * task);
-
-int task_handoff_register(struct notifier_block * n);
-int task_handoff_unregister(struct notifier_block * n);
-
 #else
 
 #define prof_on 0
@@ -93,17 +85,6 @@ static inline void profile_hit(int type, void *ip)
 	return;
 }
 
-static inline int task_handoff_register(struct notifier_block * n)
-{
-	return -ENOSYS;
-}
-
-static inline int task_handoff_unregister(struct notifier_block * n)
-{
-	return -ENOSYS;
-}
-
-#define profile_handoff_task(a) (0)
 
 #endif /* CONFIG_PROFILING */
 
-- 
cgit v1.2.3


From 4264178416cd52a55a3eccbefb3973866e060280 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 20 Dec 2021 16:28:53 -0600
Subject: ptrace: Remove unused regs argument from ptrace_report_syscall

Link: https://lkml.kernel.org/r/20220103213312.9144-7-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/tracehook.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 2564b7434b4d..88c007ab5ebc 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -54,8 +54,7 @@ struct linux_binprm;
 /*
  * ptrace report for syscall entry and exit looks identical.
  */
-static inline int ptrace_report_syscall(struct pt_regs *regs,
-					unsigned long message)
+static inline int ptrace_report_syscall(unsigned long message)
 {
 	int ptrace = current->ptrace;
 
@@ -102,7 +101,7 @@ static inline int ptrace_report_syscall(struct pt_regs *regs,
 static inline __must_check int tracehook_report_syscall_entry(
 	struct pt_regs *regs)
 {
-	return ptrace_report_syscall(regs, PTRACE_EVENTMSG_SYSCALL_ENTRY);
+	return ptrace_report_syscall(PTRACE_EVENTMSG_SYSCALL_ENTRY);
 }
 
 /**
@@ -127,7 +126,7 @@ static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step)
 	if (step)
 		user_single_step_report(regs);
 	else
-		ptrace_report_syscall(regs, PTRACE_EVENTMSG_SYSCALL_EXIT);
+		ptrace_report_syscall(PTRACE_EVENTMSG_SYSCALL_EXIT);
 }
 
 /**
-- 
cgit v1.2.3


From 40595cdc93edf4110c0f0c0b06f8d82008f23929 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 16 Dec 2021 12:20:13 -0500
Subject: nfs: block notification on fs with its own ->lock

NFSv4.1 supports an optional lock notification feature which notifies
the client when a lock comes available.  (Normally NFSv4 clients just
poll for locks if necessary.)  To make that work, we need to request a
blocking lock from the filesystem.

We turned that off for NFS in commit f657f8eef3ff ("nfs: don't atempt
blocking locks on nfs reexports") [sic] because it actually blocks the
nfsd thread while waiting for the lock.

Thanks to Vasily Averin for pointing out that NFS isn't the only
filesystem with that problem.

Any filesystem that leaves ->lock NULL will use posix_lock_file(), which
does the right thing.  Simplest is just to assume that any filesystem
that defines its own ->lock is not safe to request a blocking lock from.

So, this patch mostly reverts commit f657f8eef3ff ("nfs: don't atempt
blocking locks on nfs reexports") [sic] and commit b840be2f00c0 ("lockd:
don't attempt blocking locks on nfs reexports"), and instead uses a
check of ->lock (Vasily's suggestion) to decide whether to support
blocking lock notifications on a given filesystem.  Also add a little
documentation.

Perhaps someday we could add back an export flag later to allow
filesystems with "good" ->lock methods to support blocking lock
notifications.

Reported-by: Vasily Averin <vvs@virtuozzo.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
[ cel: Description rewritten to address checkpatch nits ]
[ cel: Fixed warning when SUNRPC debugging is disabled ]
[ cel: Fixed NULL check ]
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Vasily Averin <vvs@virtuozzo.com>
---
 include/linux/exportfs.h    | 2 --
 include/linux/lockd/lockd.h | 9 +++++++--
 2 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 3260fe714846..fe848901fcc3 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -221,8 +221,6 @@ struct export_operations {
 #define EXPORT_OP_NOATOMIC_ATTR		(0x10) /* Filesystem cannot supply
 						  atomic attribute updates
 						*/
-#define EXPORT_OP_SYNC_LOCKS		(0x20) /* Filesystem can't do
-						  asychronous blocking locks */
 	unsigned long	flags;
 };
 
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index c4ae6506b8b3..fcef192e5e45 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -303,10 +303,15 @@ void		  nlmsvc_invalidate_all(void);
 int           nlmsvc_unlock_all_by_sb(struct super_block *sb);
 int           nlmsvc_unlock_all_by_ip(struct sockaddr *server_addr);
 
+static inline struct file *nlmsvc_file_file(struct nlm_file *file)
+{
+	return file->f_file[O_RDONLY] ?
+	       file->f_file[O_RDONLY] : file->f_file[O_WRONLY];
+}
+
 static inline struct inode *nlmsvc_file_inode(struct nlm_file *file)
 {
-	return locks_inode(file->f_file[O_RDONLY] ?
-			   file->f_file[O_RDONLY] : file->f_file[O_WRONLY]);
+	return locks_inode(nlmsvc_file_file(file));
 }
 
 static inline int __nlm_privileged_request4(const struct sockaddr *sap)
-- 
cgit v1.2.3


From 0aa698787aa2a9e8840987e54ba2982559de6404 Mon Sep 17 00:00:00 2001
From: axelj <axelj@axis.com>
Date: Mon, 13 Dec 2021 08:09:25 +0100
Subject: tpm: Add Upgrade/Reduced mode support for TPM2 modules

If something went wrong during the TPM firmware upgrade, like power
failure or the firmware image file get corrupted, the TPM might end
up in Upgrade or Failure mode upon the next start. The state is
persistent between the TPM power cycle/restart.

According to TPM specification:
 * If the TPM is in Upgrade mode, it will answer with TPM2_RC_UPGRADE
   to all commands except TPM2_FieldUpgradeData(). It may also accept
   other commands if it is able to complete them using the previously
   installed firmware.
 * If the TPM is in Failure mode, it will allow performing TPM
   initialization but will not provide any crypto operations.
   Will happily respond to Field Upgrade calls.

Change the behavior of the tpm2_auto_startup(), so it detects the active
running mode of the TPM by adding the following checks.  If
tpm2_do_selftest() call returns TPM2_RC_UPGRADE, the TPM is in Upgrade
mode.
If the TPM is in Failure mode, it will successfully respond to both
tpm2_do_selftest() and tpm2_startup() calls. Although, will fail to
answer to tpm2_get_cc_attrs_tbl(). Use this fact to conclude that TPM is
in Failure mode.

If detected that the TPM is in the Upgrade or Failure mode, the function
sets TPM_CHIP_FLAG_FIRMWARE_UPGRADE_MODE flag.

The TPM_CHIP_FLAG_FIRMWARE_UPGRADE_MODE flag is used later during driver
initialization/deinitialization to disable functionality which makes no
sense or will fail in the current TPM state. Following functionality is
affected:
 * Do not register TPM as a hwrng
 * Do not register sysfs entries which provide information impossible to
   obtain in limited mode
 * Do not register resource managed character device

Signed-off-by: axelj <axelj@axis.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 include/linux/tpm.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index 12d827734686..dfeb25a0362d 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -207,6 +207,7 @@ enum tpm2_return_codes {
 	TPM2_RC_INITIALIZE	= 0x0100, /* RC_VER1 */
 	TPM2_RC_FAILURE		= 0x0101,
 	TPM2_RC_DISABLED	= 0x0120,
+	TPM2_RC_UPGRADE		= 0x012D,
 	TPM2_RC_COMMAND_CODE    = 0x0143,
 	TPM2_RC_TESTING		= 0x090A, /* RC_WARN */
 	TPM2_RC_REFERENCE_H0	= 0x0910,
@@ -278,6 +279,7 @@ enum tpm_chip_flags {
 	TPM_CHIP_FLAG_HAVE_TIMEOUTS	= BIT(4),
 	TPM_CHIP_FLAG_ALWAYS_POWERED	= BIT(5),
 	TPM_CHIP_FLAG_FIRMWARE_POWER_MANAGED	= BIT(6),
+	TPM_CHIP_FLAG_FIRMWARE_UPGRADE	= BIT(7),
 };
 
 #define to_tpm_chip(d) container_of(d, struct tpm_chip, dev)
@@ -399,6 +401,14 @@ static inline void tpm_buf_append_u32(struct tpm_buf *buf, const u32 value)
 	tpm_buf_append(buf, (u8 *) &value2, 4);
 }
 
+/*
+ * Check if TPM device is in the firmware upgrade mode.
+ */
+static inline bool tpm_is_firmware_upgrade(struct tpm_chip *chip)
+{
+	return chip->flags & TPM_CHIP_FLAG_FIRMWARE_UPGRADE;
+}
+
 static inline u32 tpm2_rc_value(u32 rc)
 {
 	return (rc & BIT(7)) ? rc & 0xff : rc;
-- 
cgit v1.2.3


From 292c33c95defd0b814fec1fc8cd60d16556cf7b8 Mon Sep 17 00:00:00 2001
From: Yang Li <yang.lee@linux.alibaba.com>
Date: Fri, 7 Jan 2022 08:52:28 +0800
Subject: block: fix old-style declaration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move the 'inline' keyword to the front of 'void'.

Remove a warning found by clang(make W=1 LLVM=1)
./include/linux/blk-mq.h:259:1: warning: ‘inline’ is not at beginning of
declaration

Reported-by: Abaci Robot <abaci@linux.alibaba.com>
Signed-off-by: Yang Li <yang.lee@linux.alibaba.com>
Link: https://lore.kernel.org/r/20220107005228.103927-1-yang.lee@linux.alibaba.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index f40a05ecca4a..d319ffa59354 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -256,7 +256,7 @@ static inline unsigned short req_get_ioprio(struct request *req)
  * @rq: The request to move
  * @prev: The request preceding @rq in @src (NULL if @rq is the head)
  */
-static void inline rq_list_move(struct request **src, struct request **dst,
+static inline void rq_list_move(struct request **src, struct request **dst,
 				struct request *rq, struct request *prev)
 {
 	if (prev)
-- 
cgit v1.2.3


From 0ea9fc15b1d7d6636d429e74ffe3f86bf2f2f7d6 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 23 Nov 2021 17:05:07 +0100
Subject: fs/locks: fix fcntl_getlk64/fcntl_setlk64 stub prototypes

My patch to rework oabi fcntl64() introduced a harmless
sparse warning when file locking is disabled:

   arch/arm/kernel/sys_oabi-compat.c:251:51: sparse: sparse: incorrect type in argument 3 (different address spaces) @@     expected struct flock64 [noderef] __user *user @@     got struct flock64 * @@
   arch/arm/kernel/sys_oabi-compat.c:251:51: sparse:     expected struct flock64 [noderef] __user *user
   arch/arm/kernel/sys_oabi-compat.c:251:51: sparse:     got struct flock64 *
   arch/arm/kernel/sys_oabi-compat.c:265:55: sparse: sparse: incorrect type in argument 4 (different address spaces) @@     expected struct flock64 [noderef] __user *user @@     got struct flock64 * @@
   arch/arm/kernel/sys_oabi-compat.c:265:55: sparse:     expected struct flock64 [noderef] __user *user
   arch/arm/kernel/sys_oabi-compat.c:265:55: sparse:     got struct flock64 *

When file locking is enabled, everything works correctly and the
right data gets passed, but the stub declarations in linux/fs.h
did not get modified when the calling conventions changed in an
earlier patch.

Reported-by: kernel test robot <lkp@intel.com>
Fixes: 7e2d8c29ecdd ("ARM: 9111/1: oabi-compat: rework fcntl64() emulation")
Fixes: a75d30c77207 ("fs/locks: pass kernel struct flock to fcntl_getlk/setlk")
Cc: Christoph Hellwig <hch@lst.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/fs.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index bbf812ce89a8..5122d13775c2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1220,13 +1220,13 @@ static inline int fcntl_setlk(unsigned int fd, struct file *file,
 
 #if BITS_PER_LONG == 32
 static inline int fcntl_getlk64(struct file *file, unsigned int cmd,
-				struct flock64 __user *user)
+				struct flock64 *user)
 {
 	return -EINVAL;
 }
 
 static inline int fcntl_setlk64(unsigned int fd, struct file *file,
-				unsigned int cmd, struct flock64 __user *user)
+				unsigned int cmd, struct flock64 *user)
 {
 	return -EACCES;
 }
-- 
cgit v1.2.3


From 719774377622bc4025d2a74f551b5dc2158c6c30 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 7 Jan 2022 05:03:22 +0100
Subject: netfilter: conntrack: convert to refcount_t api

Convert nf_conn reference counting from atomic_t to refcount_t based api.
refcount_t api provides more runtime sanity checks and will warn on
certain constructs, e.g. refcount_inc() on a zero reference count, which
usually indicates use-after-free.

For this reason template allocation is changed to init the refcount to
1, the subsequenct add operations are removed.

Likewise, init_conntrack() is changed to set the initial refcount to 1
instead refcount_inc().

This is safe because the new entry is not (yet) visible to other cpus.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nf_conntrack_common.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h
index 700ea077ce2d..a03f7a80b9ab 100644
--- a/include/linux/netfilter/nf_conntrack_common.h
+++ b/include/linux/netfilter/nf_conntrack_common.h
@@ -2,7 +2,7 @@
 #ifndef _NF_CONNTRACK_COMMON_H
 #define _NF_CONNTRACK_COMMON_H
 
-#include <linux/atomic.h>
+#include <linux/refcount.h>
 #include <uapi/linux/netfilter/nf_conntrack_common.h>
 
 struct ip_conntrack_stat {
@@ -25,19 +25,19 @@ struct ip_conntrack_stat {
 #define NFCT_PTRMASK	~(NFCT_INFOMASK)
 
 struct nf_conntrack {
-	atomic_t use;
+	refcount_t use;
 };
 
 void nf_conntrack_destroy(struct nf_conntrack *nfct);
 static inline void nf_conntrack_put(struct nf_conntrack *nfct)
 {
-	if (nfct && atomic_dec_and_test(&nfct->use))
+	if (nfct && refcount_dec_and_test(&nfct->use))
 		nf_conntrack_destroy(nfct);
 }
 static inline void nf_conntrack_get(struct nf_conntrack *nfct)
 {
 	if (nfct)
-		atomic_inc(&nfct->use);
+		refcount_inc(&nfct->use);
 }
 
 #endif /* _NF_CONNTRACK_COMMON_H */
-- 
cgit v1.2.3


From 3fce16493dc1aa2c9af3d7e7bd360dfe203a3e6a Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 7 Jan 2022 05:03:23 +0100
Subject: netfilter: core: move ip_ct_attach indirection to struct nf_ct_hook

ip_ct_attach predates struct nf_ct_hook, we can place it there and
remove the exported symbol.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 3fda1a508733..e0e3f3355ab1 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -440,7 +440,6 @@ nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family)
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
 #include <linux/netfilter/nf_conntrack_zones_common.h>
 
-extern void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *) __rcu;
 void nf_ct_attach(struct sk_buff *, const struct sk_buff *);
 struct nf_conntrack_tuple;
 bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
@@ -463,6 +462,7 @@ struct nf_ct_hook {
 	void (*destroy)(struct nf_conntrack *);
 	bool (*get_tuple_skb)(struct nf_conntrack_tuple *,
 			      const struct sk_buff *);
+	void (*attach)(struct sk_buff *nskb, const struct sk_buff *skb);
 };
 extern struct nf_ct_hook __rcu *nf_ct_hook;
 
-- 
cgit v1.2.3


From 285c8a7a58158cb1805c97ff03875df2ba2ea1fe Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 7 Jan 2022 05:03:24 +0100
Subject: netfilter: make function op structures const

No functional changes, these structures should be const.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index e0e3f3355ab1..15e71bfff726 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -381,13 +381,13 @@ struct nf_nat_hook {
 				  enum ip_conntrack_dir dir);
 };
 
-extern struct nf_nat_hook __rcu *nf_nat_hook;
+extern const struct nf_nat_hook __rcu *nf_nat_hook;
 
 static inline void
 nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family)
 {
 #if IS_ENABLED(CONFIG_NF_NAT)
-	struct nf_nat_hook *nat_hook;
+	const struct nf_nat_hook *nat_hook;
 
 	rcu_read_lock();
 	nat_hook = rcu_dereference(nf_nat_hook);
@@ -464,7 +464,7 @@ struct nf_ct_hook {
 			      const struct sk_buff *);
 	void (*attach)(struct sk_buff *nskb, const struct sk_buff *skb);
 };
-extern struct nf_ct_hook __rcu *nf_ct_hook;
+extern const struct nf_ct_hook __rcu *nf_ct_hook;
 
 struct nlattr;
 
@@ -479,7 +479,7 @@ struct nfnl_ct_hook {
 	void (*seq_adjust)(struct sk_buff *skb, struct nf_conn *ct,
 			   enum ip_conntrack_info ctinfo, s32 off);
 };
-extern struct nfnl_ct_hook __rcu *nfnl_ct_hook;
+extern const struct nfnl_ct_hook __rcu *nfnl_ct_hook;
 
 /**
  * nf_skb_duplicated - TEE target has sent a packet
-- 
cgit v1.2.3


From 6ae7989c9af0d98ab64196f4f4c6f6499454bd23 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 7 Jan 2022 05:03:25 +0100
Subject: netfilter: conntrack: avoid useless indirection during conntrack
 destruction

nf_ct_put() results in a usesless indirection:

nf_ct_put -> nf_conntrack_put -> nf_conntrack_destroy -> rcu readlock +
indirect call of ct_hooks->destroy().

There are two _put helpers:
nf_ct_put and nf_conntrack_put.  The latter is what should be used in
code that MUST NOT cause a linker dependency on the conntrack module
(e.g. calls from core network stack).

Everyone else should call nf_ct_put() instead.

A followup patch will convert a few nf_conntrack_put() calls to
nf_ct_put(), in particular from modules that already have a conntrack
dependency such as act_ct or even nf_conntrack itself.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nf_conntrack_common.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h
index a03f7a80b9ab..2770db2fa080 100644
--- a/include/linux/netfilter/nf_conntrack_common.h
+++ b/include/linux/netfilter/nf_conntrack_common.h
@@ -29,6 +29,8 @@ struct nf_conntrack {
 };
 
 void nf_conntrack_destroy(struct nf_conntrack *nfct);
+
+/* like nf_ct_put, but without module dependency on nf_conntrack */
 static inline void nf_conntrack_put(struct nf_conntrack *nfct)
 {
 	if (nfct && refcount_dec_and_test(&nfct->use))
-- 
cgit v1.2.3


From 6316136ec6e3dd1c302f7e7289a9ee46ecc610ae Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 7 Jan 2022 15:46:16 +0100
Subject: netfilter: egress: avoid a lockdep splat

include/linux/netfilter_netdev.h:97 suspicious rcu_dereference_check() usage!
2 locks held by sd-resolve/1100:
 0: ..(rcu_read_lock_bh){1:3}, at: ip_finish_output2
 1: ..(rcu_read_lock_bh){1:3}, at: __dev_queue_xmit
 __dev_queue_xmit+0 ..

The helper has two callers, one uses rcu_read_lock, the other
rcu_read_lock_bh().  Annotate the dereference to reflect this.

Fixes: 42df6e1d221dd ("netfilter: Introduce egress hook")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_netdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_netdev.h b/include/linux/netfilter_netdev.h
index b71b57a83bb4..b4dd96e4dc8d 100644
--- a/include/linux/netfilter_netdev.h
+++ b/include/linux/netfilter_netdev.h
@@ -94,7 +94,7 @@ static inline struct sk_buff *nf_hook_egress(struct sk_buff *skb, int *rc,
 		return skb;
 #endif
 
-	e = rcu_dereference(dev->nf_hooks_egress);
+	e = rcu_dereference_check(dev->nf_hooks_egress, rcu_read_lock_bh_held());
 	if (!e)
 		return skb;
 
-- 
cgit v1.2.3


From 6f022c2ddbcefaee79502ce5386dfe351d457070 Mon Sep 17 00:00:00 2001
From: Paul Blakey <paulb@nvidia.com>
Date: Thu, 6 Jan 2022 17:38:04 +0200
Subject: net: openvswitch: Fix ct_state nat flags for conns arriving from tc

Netfilter conntrack maintains NAT flags per connection indicating
whether NAT was configured for the connection. Openvswitch maintains
NAT flags on the per packet flow key ct_state field, indicating
whether NAT was actually executed on the packet.

When a packet misses from tc to ovs the conntrack NAT flags are set.
However, NAT was not necessarily executed on the packet because the
connection's state might still be in NEW state. As such, openvswitch
wrongly assumes that NAT was executed and sets an incorrect flow key
NAT flags.

Fix this, by flagging to openvswitch which NAT was actually done in
act_ct via tc_skb_ext and tc_skb_cb to the openvswitch module, so
the packet flow key NAT flags will be correctly set.

Fixes: b57dc7c13ea9 ("net/sched: Introduce action ct")
Signed-off-by: Paul Blakey <paulb@nvidia.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Link: https://lore.kernel.org/r/20220106153804.26451-1-paulb@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 4507d77d6941..60ab0c2fe567 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -287,7 +287,9 @@ struct tc_skb_ext {
 	__u32 chain;
 	__u16 mru;
 	__u16 zone;
-	bool post_ct;
+	u8 post_ct:1;
+	u8 post_ct_snat:1;
+	u8 post_ct_dnat:1;
 };
 #endif
 
-- 
cgit v1.2.3


From c504e5c2f9648a1e5c2be01e8c3f59d394192bd3 Mon Sep 17 00:00:00 2001
From: Menglong Dong <imagedong@tencent.com>
Date: Sun, 9 Jan 2022 14:36:26 +0800
Subject: net: skb: introduce kfree_skb_reason()

Introduce the interface kfree_skb_reason(), which is able to pass
the reason why the skb is dropped to 'kfree_skb' tracepoint.

Add the 'reason' field to 'trace_kfree_skb', therefor user can get
more detail information about abnormal skb with 'drop_monitor' or
eBPF.

All drop reasons are defined in the enum 'skb_drop_reason', and
they will be print as string in 'kfree_skb' tracepoint in format
of 'reason: XXX'.

( Maybe the reasons should be defined in a uapi header file, so that
user space can use them? )

Signed-off-by: Menglong Dong <imagedong@tencent.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 642acb0d1646..ef0870abc791 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -305,6 +305,17 @@ struct sk_buff_head {
 
 struct sk_buff;
 
+/* The reason of skb drop, which is used in kfree_skb_reason().
+ * en...maybe they should be splited by group?
+ *
+ * Each item here should also be in 'TRACE_SKB_DROP_REASON', which is
+ * used to translate the reason to string.
+ */
+enum skb_drop_reason {
+	SKB_DROP_REASON_NOT_SPECIFIED,
+	SKB_DROP_REASON_MAX,
+};
+
 /* To allow 64K frame to be packed as single skb without frag_list we
  * require 64K/PAGE_SIZE pages plus 1 additional page to allow for
  * buffers which do not start on a page boundary.
@@ -1085,8 +1096,18 @@ static inline bool skb_unref(struct sk_buff *skb)
 	return true;
 }
 
+void kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason);
+
+/**
+ *	kfree_skb - free an sk_buff with 'NOT_SPECIFIED' reason
+ *	@skb: buffer to free
+ */
+static inline void kfree_skb(struct sk_buff *skb)
+{
+	kfree_skb_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED);
+}
+
 void skb_release_head_state(struct sk_buff *skb);
-void kfree_skb(struct sk_buff *skb);
 void kfree_skb_list(struct sk_buff *segs);
 void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt);
 void skb_tx_error(struct sk_buff *skb);
-- 
cgit v1.2.3


From 85125597419aec3aa7b8f3b8713e415f997796f2 Mon Sep 17 00:00:00 2001
From: Menglong Dong <imagedong@tencent.com>
Date: Sun, 9 Jan 2022 14:36:27 +0800
Subject: net: skb: use kfree_skb_reason() in tcp_v4_rcv()

Replace kfree_skb() with kfree_skb_reason() in tcp_v4_rcv(). Following
drop reasons are added:

SKB_DROP_REASON_NO_SOCKET
SKB_DROP_REASON_PKT_TOO_SMALL
SKB_DROP_REASON_TCP_CSUM
SKB_DROP_REASON_TCP_FILTER

After this patch, 'kfree_skb' event will print message like this:

$           TASK-PID     CPU#  |||||  TIMESTAMP  FUNCTION
$              | |         |   |||||     |         |
          <idle>-0       [000] ..s1.    36.113438: kfree_skb: skbaddr=(____ptrval____) protocol=2048 location=(____ptrval____) reason: NO_SOCKET

The reason of skb drop is printed too.

Signed-off-by: Menglong Dong <imagedong@tencent.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index ef0870abc791..c9c97b0d0fe9 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -313,6 +313,10 @@ struct sk_buff;
  */
 enum skb_drop_reason {
 	SKB_DROP_REASON_NOT_SPECIFIED,
+	SKB_DROP_REASON_NO_SOCKET,
+	SKB_DROP_REASON_PKT_TOO_SMALL,
+	SKB_DROP_REASON_TCP_CSUM,
+	SKB_DROP_REASON_TCP_FILTER,
 	SKB_DROP_REASON_MAX,
 };
 
-- 
cgit v1.2.3


From 1c7fab70df085d866a3765955f397ca2b4025b15 Mon Sep 17 00:00:00 2001
From: Menglong Dong <imagedong@tencent.com>
Date: Sun, 9 Jan 2022 14:36:28 +0800
Subject: net: skb: use kfree_skb_reason() in __udp4_lib_rcv()

Replace kfree_skb() with kfree_skb_reason() in __udp4_lib_rcv.
New drop reason 'SKB_DROP_REASON_UDP_CSUM' is added for udp csum
error.

Signed-off-by: Menglong Dong <imagedong@tencent.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c9c97b0d0fe9..af64c7de9b53 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -317,6 +317,7 @@ enum skb_drop_reason {
 	SKB_DROP_REASON_PKT_TOO_SMALL,
 	SKB_DROP_REASON_TCP_CSUM,
 	SKB_DROP_REASON_TCP_FILTER,
+	SKB_DROP_REASON_UDP_CSUM,
 	SKB_DROP_REASON_MAX,
 };
 
-- 
cgit v1.2.3


From a6b5a28eb56c3f4988f7ff5290b954ba296e309a Mon Sep 17 00:00:00 2001
From: Dave Wysochanski <dwysocha@redhat.com>
Date: Sat, 14 Nov 2020 13:43:54 -0500
Subject: nfs: Convert to new fscache volume/cookie API

Change the nfs filesystem to support fscache's indexing rewrite and
reenable caching in nfs.

The following changes have been made:

 (1) The fscache_netfs struct is no more, and there's no need to register
     the filesystem as a whole.

 (2) The session cookie is now an fscache_volume cookie, allocated with
     fscache_acquire_volume().  That takes three parameters: a string
     representing the "volume" in the index, a string naming the cache to
     use (or NULL) and a u64 that conveys coherency metadata for the
     volume.

     For nfs, I've made it render the volume name string as:

        "nfs,<ver>,<family>,<address>,<port>,<fsidH>,<fsidL>*<,param>[,<uniq>]"

 (3) The fscache_cookie_def is no more and needed information is passed
     directly to fscache_acquire_cookie().  The cache no longer calls back
     into the filesystem, but rather metadata changes are indicated at
     other times.

     fscache_acquire_cookie() is passed the same keying and coherency
     information as before.

 (4) fscache_enable/disable_cookie() have been removed.

     Call fscache_use_cookie() and fscache_unuse_cookie() when a file is
     opened or closed to prevent a cache file from being culled and to keep
     resources to hand that are needed to do I/O.

     If a file is opened for writing, we invalidate it with
     FSCACHE_INVAL_DIO_WRITE in lieu of doing writeback to the cache,
     thereby making it cease caching until all currently open files are
     closed.  This should give the same behaviour as the uptream code.
     Making the cache store local modifications isn't straightforward for
     NFS, so that's left for future patches.

 (5) fscache_invalidate() now needs to be given uptodate auxiliary data and
     a file size.  It also takes a flag to indicate if this was due to a
     DIO write.

 (6) Call nfs_fscache_invalidate() with FSCACHE_INVAL_DIO_WRITE on a file
     to which a DIO write is made.

 (7) Call fscache_note_page_release() from nfs_release_page().

 (8) Use a killable wait in nfs_vm_page_mkwrite() when waiting for
     PG_fscache to be cleared.

 (9) The functions to read and write data to/from the cache are stubbed out
     pending a conversion to use netfslib.

Changes
=======
ver #3:
 - Added missing =n fallback for nfs_fscache_release_file()[1][2].

ver #2:
 - Use gfpflags_allow_blocking() rather than using flag directly.
 - fscache_acquire_volume() now returns errors.
 - Remove NFS_INO_FSCACHE as it's no longer used.
 - Need to unuse a cookie on file-release, not inode-clear.

Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
Co-developed-by: David Howells <dhowells@redhat.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Dave Wysochanski <dwysocha@redhat.com>
Acked-by: Jeff Layton <jlayton@kernel.org>
cc: Trond Myklebust <trond.myklebust@hammerspace.com>
cc: Anna Schumaker <anna.schumaker@netapp.com>
cc: linux-nfs@vger.kernel.org
cc: linux-cachefs@redhat.com
Link: https://lore.kernel.org/r/202112100804.nksO8K4u-lkp@intel.com/ [1]
Link: https://lore.kernel.org/r/202112100957.2oEDT20W-lkp@intel.com/ [2]
Link: https://lore.kernel.org/r/163819668938.215744.14448852181937731615.stgit@warthog.procyon.org.uk/ # v1
Link: https://lore.kernel.org/r/163906979003.143852.2601189243864854724.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/163967182112.1823006.7791504655391213379.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/164021575950.640689.12069642327533368467.stgit@warthog.procyon.org.uk/ # v4
---
 include/linux/nfs_fs.h    | 1 -
 include/linux/nfs_fs_sb.h | 9 ++-------
 2 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 05f249f20f55..00835bacd236 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -275,7 +275,6 @@ struct nfs4_copy_state {
 #define NFS_INO_ACL_LRU_SET	(2)		/* Inode is on the LRU list */
 #define NFS_INO_INVALIDATING	(3)		/* inode is being invalidated */
 #define NFS_INO_FSCACHE		(5)		/* inode can be cached by FS-Cache */
-#define NFS_INO_FSCACHE_LOCK	(6)		/* FS-Cache cookie management lock */
 #define NFS_INO_FORCE_READDIR	(7)		/* force readdirplus */
 #define NFS_INO_LAYOUTCOMMIT	(9)		/* layoutcommit required */
 #define NFS_INO_LAYOUTCOMMITTING (10)		/* layoutcommit inflight */
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 2a9acbfe00f0..77b2dba27bbb 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -120,11 +120,6 @@ struct nfs_client {
 	 * This is used to generate the mv0 callback address.
 	 */
 	char			cl_ipaddr[48];
-
-#ifdef CONFIG_NFS_FSCACHE
-	struct fscache_cookie	*fscache;	/* client index cache cookie */
-#endif
-
 	struct net		*cl_net;
 	struct list_head	pending_cb_stateids;
 };
@@ -194,8 +189,8 @@ struct nfs_server {
 	struct nfs_auth_info	auth_info;	/* parsed auth flavors */
 
 #ifdef CONFIG_NFS_FSCACHE
-	struct nfs_fscache_key	*fscache_key;	/* unique key for superblock */
-	struct fscache_cookie	*fscache;	/* superblock cookie */
+	struct fscache_volume	*fscache;	/* superblock cookie */
+	char			*fscache_uniq;	/* Uniquifier (or NULL) */
 #endif
 
 	u32			pnfs_blksize;	/* layout_blksize attr */
-- 
cgit v1.2.3


From 16f2f4e679cfdaa9552574484f104014908a76c6 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 27 Aug 2021 15:19:34 +0100
Subject: nfs: Implement cache I/O by accessing the cache directly

Move NFS to using fscache DIO API instead of the old upstream I/O API as
that has been removed.  This is a stopgap solution as the intention is that
at sometime in the future, the cache will move to using larger blocks and
won't be able to store individual pages in order to deal with the potential
for data corruption due to the backing filesystem being able insert/remove
bridging blocks of zeros into its extent list[1].

NFS then reads and writes cache pages synchronously and one page at a time.

The preferred change would be to use the netfs lib, but the new I/O API can
be used directly.  It's just that as the cache now needs to track data for
itself, caching blocks may exceed page size...

This code is somewhat borrowed from my "fallback I/O" patchset[2].

Changes
=======
ver #3:
 - Restore lost =n fallback for nfs_fscache_release_page()[2].

Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Dave Wysochanski <dwysocha@redhat.com>
Acked-by: Jeff Layton <jlayton@kernel.org>
cc: Trond Myklebust <trond.myklebust@hammerspace.com>
cc: Anna Schumaker <anna.schumaker@netapp.com>
cc: linux-nfs@vger.kernel.org
cc: linux-cachefs@redhat.com
Link: https://lore.kernel.org/r/YO17ZNOcq+9PajfQ@mit.edu [1]
Link: https://lore.kernel.org/r/202112100957.2oEDT20W-lkp@intel.com/ [2]
Link: https://lore.kernel.org/r/163189108292.2509237.12615909591150927232.stgit@warthog.procyon.org.uk/ [2]
Link: https://lore.kernel.org/r/163906981318.143852.17220018647843475985.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/163967184451.1823006.6450645559828329590.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/164021577632.640689.11069627070150063812.stgit@warthog.procyon.org.uk/ # v4
---
 include/linux/fscache.h | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index 7bd35f60d19a..ede50406bcb0 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -168,6 +168,7 @@ extern void __fscache_relinquish_cookie(struct fscache_cookie *, bool);
 extern void __fscache_resize_cookie(struct fscache_cookie *, loff_t);
 extern void __fscache_invalidate(struct fscache_cookie *, const void *, loff_t, unsigned int);
 extern int __fscache_begin_read_operation(struct netfs_cache_resources *, struct fscache_cookie *);
+extern int __fscache_begin_write_operation(struct netfs_cache_resources *, struct fscache_cookie *);
 
 extern void __fscache_write_to_cache(struct fscache_cookie *, struct address_space *,
 				     loff_t, size_t, loff_t, netfs_io_terminated_t, void *,
@@ -499,6 +500,33 @@ int fscache_read(struct netfs_cache_resources *cres,
 			 term_func, term_func_priv);
 }
 
+/**
+ * fscache_begin_write_operation - Begin a write operation for the netfs lib
+ * @cres: The cache resources for the write being performed
+ * @cookie: The cookie representing the cache object
+ *
+ * Begin a write operation on behalf of the netfs helper library.  @cres
+ * indicates the cache resources to which the operation state should be
+ * attached; @cookie indicates the cache object that will be accessed.
+ *
+ * @cres->inval_counter is set from @cookie->inval_counter for comparison at
+ * the end of the operation.  This allows invalidation during the operation to
+ * be detected by the caller.
+ *
+ * Returns:
+ * * 0		- Success
+ * * -ENOBUFS	- No caching available
+ * * Other error code from the cache, such as -ENOMEM.
+ */
+static inline
+int fscache_begin_write_operation(struct netfs_cache_resources *cres,
+				  struct fscache_cookie *cookie)
+{
+	if (fscache_cookie_enabled(cookie))
+		return __fscache_begin_write_operation(cres, cookie);
+	return -ENOBUFS;
+}
+
 /**
  * fscache_write - Start a write to the cache.
  * @cres: The cache resources to use
-- 
cgit v1.2.3


From a59466ee91aaa9d43889a4c51e01de087d188448 Mon Sep 17 00:00:00 2001
From: Karolina Drobnik <karolinadrobnik@gmail.com>
Date: Tue, 11 Jan 2022 10:28:47 +0000
Subject: memblock: Remove #ifdef __KERNEL__ from memblock.h

memblock.h is not a uAPI header, so __KERNEL__ guard can be deleted

Signed-off-by: Karolina Drobnik <karolinadrobnik@gmail.com>
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Link: https://lore.kernel.org/r/20220111102847.673746-1-karolinadrobnik@gmail.com
---
 include/linux/memblock.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 9dc7cb239d21..50ad19662a32 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -1,7 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
 #ifndef _LINUX_MEMBLOCK_H
 #define _LINUX_MEMBLOCK_H
-#ifdef __KERNEL__
 
 /*
  * Logical memory blocks.
@@ -605,6 +604,5 @@ static inline void early_memtest(phys_addr_t start, phys_addr_t end)
 }
 #endif
 
-#endif /* __KERNEL__ */
 
 #endif /* _LINUX_MEMBLOCK_H */
-- 
cgit v1.2.3


From 500b55b05d0a21c4adddf4c3b29ee6f32b502046 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Tue, 21 Dec 2021 10:45:07 -0600
Subject: PCI: Work around Intel I210 ROM BAR overlap defect

Per PCIe r5, sec 7.5.1.2.4, a device must not claim accesses to its
Expansion ROM unless both the Memory Space Enable and the Expansion ROM
Enable bit are set.  But apparently some Intel I210 NICs don't work
correctly if the ROM BAR overlaps another BAR, even if the Expansion ROM is
disabled.

Michael reported that on a Kontron SMARC-sAL28 ARM64 system with U-Boot
v2021.01-rc3, the ROM BAR overlaps BAR 3, and networking doesn't work at
all:

  BAR 0: 0x40000000 (32-bit, non-prefetchable) [size=1M]
  BAR 3: 0x40200000 (32-bit, non-prefetchable) [size=16K]
  ROM:   0x40200000 (disabled) [size=1M]

  NETDEV WATCHDOG: enP2p1s0 (igb): transmit queue 0 timed out
  Hardware name: Kontron SMARC-sAL28 (Single PHY) on SMARC Eval 2.0 carrier (DT)
  igb 0002:01:00.0 enP2p1s0: Reset adapter

Previously, pci_std_update_resource() wrote the assigned ROM address to the
BAR only when the ROM was enabled.  This meant that the I210 ROM BAR could
be left with an address assigned by firmware, which might overlap with
other BARs.

Quirk these I210 devices so pci_std_update_resource() always writes the
assigned address to the ROM BAR, whether or not the ROM is enabled.

Link: https://lore.kernel.org/r/20211223163754.GA1267351@bhelgaas
Link: https://lore.kernel.org/r/20201230185317.30915-1-michael@walle.cc
Link: https://bugzilla.kernel.org/show_bug.cgi?id=211105
Reported-by: Michael Walle <michael@walle.cc>
Tested-by: Michael Walle <michael@walle.cc>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 include/linux/pci.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 18a75c8e615c..51c4a063f489 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -455,6 +455,7 @@ struct pci_dev {
 	unsigned int	link_active_reporting:1;/* Device capable of reporting link active */
 	unsigned int	no_vf_scan:1;		/* Don't scan for VFs after IOV enablement */
 	unsigned int	no_command_memory:1;	/* No PCI_COMMAND_MEMORY */
+	unsigned int	rom_bar_overlap:1;	/* ROM BAR disable broken */
 	pci_dev_flags_t dev_flags;
 	atomic_t	enable_cnt;	/* pci_enable_device has been called */
 
-- 
cgit v1.2.3


From 1d3cfc2835c1754d19a743dc346a9e58cf0c07c0 Mon Sep 17 00:00:00 2001
From: Kelvin Cao <kelvin.cao@microchip.com>
Date: Thu, 23 Dec 2021 17:23:33 -0800
Subject: ntb_hw_switchtec: Remove code for disabling ID protection

ID protection is a firmware setting for NT window access control. With
it enabled, only the posted requests with requester IDs in the requester
ID table will be allowed to access the NT windows. Otherwise all posted
requests are allowed. Normally user will configure it statically via the
Switchtec config file, and it will take effect when the firmware boots
up. The driver can also toggle the ID protection setting dynamically,
which will overwrite the static setting in the Switchtec config file as
a side effect.

Currently, the driver disables the ID protection. However, it's not
necessary to disable the ID protection at the driver level as the driver
has already configured the proper requester IDs in the requester ID
table to allow the corresponding posted requests to hit the NT windows.
Remove the code that disables the ID protection to make the static
setting prevail.

Note: ID protection is not applicable to non-posted requests.

Signed-off-by: Kelvin Cao <kelvin.cao@microchip.com>
Signed-off-by: Jon Mason <jdmason@kudzu.us>
---
 include/linux/switchtec.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/switchtec.h b/include/linux/switchtec.h
index be24056ac00f..48fabe36509e 100644
--- a/include/linux/switchtec.h
+++ b/include/linux/switchtec.h
@@ -337,8 +337,6 @@ enum {
 	NTB_CTRL_REQ_ID_EN = 1 << 0,
 
 	NTB_CTRL_LUT_EN = 1 << 0,
-
-	NTB_PART_CTRL_ID_PROT_DIS = 1 << 0,
 };
 
 struct ntb_ctrl_regs {
-- 
cgit v1.2.3


From ca321ec74322e3c49552fc1ffc80b42d0dbf1a84 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Sat, 8 Jan 2022 15:06:57 +0100
Subject: module.h: allow #define strings to work with MODULE_IMPORT_NS

The MODULE_IMPORT_NS() macro does not allow defined strings to work
properly with it, so add a layer of indirection to allow this to happen.

Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Jessica Yu <jeyu@kernel.org>
Cc: Matthias Maennich <maennich@google.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Matthias Maennich <maennich@google.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 include/linux/module.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index c9f1200b2312..f4338235ed2c 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -290,7 +290,8 @@ extern typeof(name) __mod_##type##__##name##_device_table		\
  * files require multiple MODULE_FIRMWARE() specifiers */
 #define MODULE_FIRMWARE(_firmware) MODULE_INFO(firmware, _firmware)
 
-#define MODULE_IMPORT_NS(ns) MODULE_INFO(import_ns, #ns)
+#define _MODULE_IMPORT_NS(ns)	MODULE_INFO(import_ns, #ns)
+#define MODULE_IMPORT_NS(ns)	_MODULE_IMPORT_NS(ns)
 
 struct notifier_block;
 
-- 
cgit v1.2.3


From 3f4b32511a77bc5a05cfbf26fec94c4e1b1cf46a Mon Sep 17 00:00:00 2001
From: Paul Cercueil <paul@crapouillou.net>
Date: Fri, 7 Jan 2022 18:17:18 +0000
Subject: PM: core: Remove DEFINE_UNIVERSAL_DEV_PM_OPS() macro

The deprecated UNIVERSAL_DEV_PM_OPS() macro uses the provided callbacks
for both runtime PM and system sleep, which is very likely to be a
mistake, as a system sleep can be triggered while a given device is
already PM-suspended, which would cause the suspend callback to be
called twice.

The amount of users of UNIVERSAL_DEV_PM_OPS() is also tiny (16
occurences) compared to the number of places where
SET_SYSTEM_SLEEP_PM_OPS() is used with pm_runtime_force_suspend() and
pm_runtime_force_resume(), which makes me think that none of these cases
are actually valid.

As the new macro DEFINE_UNIVERSAL_DEV_PM_OPS() which was introduced to
replace UNIVERSAL_DEV_PM_OPS() is currently unused, remove it before
someone starts to use it in yet another invalid case.

Signed-off-by: Paul Cercueil <paul@crapouillou.net>
Acked-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/pm.h | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pm.h b/include/linux/pm.h
index e1e9402180b9..02f059d814bb 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -366,6 +366,12 @@ static const struct dev_pm_ops name = { \
 	SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \
 }
 
+/* Deprecated. Use DEFINE_SIMPLE_DEV_PM_OPS() instead. */
+#define SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \
+const struct dev_pm_ops __maybe_unused name = { \
+	SET_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \
+}
+
 /*
  * Use this for defining a set of PM operations to be used in all situations
  * (system suspend, hibernation or runtime PM).
@@ -378,20 +384,9 @@ static const struct dev_pm_ops name = { \
  * suspend and "early" resume callback pointers, .suspend_late() and
  * .resume_early(), to the same routines as .runtime_suspend() and
  * .runtime_resume(), respectively (and analogously for hibernation).
+ *
+ * Deprecated. You most likely don't want this macro.
  */
-#define DEFINE_UNIVERSAL_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \
-static const struct dev_pm_ops name = { \
-	SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \
-	RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \
-}
-
-/* Deprecated. Use DEFINE_SIMPLE_DEV_PM_OPS() instead. */
-#define SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \
-const struct dev_pm_ops __maybe_unused name = { \
-	SET_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \
-}
-
-/* Deprecated. Use DEFINE_UNIVERSAL_DEV_PM_OPS() instead. */
 #define UNIVERSAL_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \
 const struct dev_pm_ops __maybe_unused name = { \
 	SET_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \
-- 
cgit v1.2.3


From 52cc1d7f9786d2be44a3ab9b5b48416a7618e713 Mon Sep 17 00:00:00 2001
From: Paul Cercueil <paul@crapouillou.net>
Date: Fri, 7 Jan 2022 18:17:19 +0000
Subject: PM: core: Remove static qualifier in DEFINE_SIMPLE_DEV_PM_OPS macro

Keep this macro in line with the other ones. This makes it possible to
use them in the cases where the underlying dev_pm_ops structure is
exported.

Restore the "static" qualifier in the two drivers where the
DEFINE_SIMPLE_DEV_PM_OPS macro was used.

Signed-off-by: Paul Cercueil <paul@crapouillou.net>
Acked-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/pm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pm.h b/include/linux/pm.h
index 02f059d814bb..8e13387e70ec 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -362,7 +362,7 @@ struct dev_pm_ops {
  * to RAM and hibernation.
  */
 #define DEFINE_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \
-static const struct dev_pm_ops name = { \
+const struct dev_pm_ops name = { \
 	SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \
 }
 
-- 
cgit v1.2.3


From 0ae101fdd3297b7165755340e05386f1e1379709 Mon Sep 17 00:00:00 2001
From: Paul Cercueil <paul@crapouillou.net>
Date: Fri, 7 Jan 2022 18:17:20 +0000
Subject: PM: core: Add EXPORT[_GPL]_SIMPLE_DEV_PM_OPS macros

These macros are defined conditionally, according to CONFIG_PM:
- if CONFIG_PM is enabled, these macros resolve to
  DEFINE_SIMPLE_DEV_PM_OPS(), and the dev_pm_ops symbol will be
  exported.

- if CONFIG_PM is disabled, these macros will result in a dummy static
  dev_pm_ops to be created with the __maybe_unused flag. The dev_pm_ops
  will then be discarded by the compiler, along with the provided
  callback functions if they are not used anywhere else.

In the second case, the symbol is not exported, which should be
perfectly fine - users of the symbol should all use the pm_ptr() or
pm_sleep_ptr() macro, so the dev_pm_ops marked as "extern" in the
client's code will never be accessed.

Signed-off-by: Paul Cercueil <paul@crapouillou.net>
Acked-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/pm.h | 35 ++++++++++++++++++++++++++++++++---
 1 file changed, 32 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pm.h b/include/linux/pm.h
index 8e13387e70ec..8279af2c538a 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -8,6 +8,7 @@
 #ifndef _LINUX_PM_H
 #define _LINUX_PM_H
 
+#include <linux/export.h>
 #include <linux/list.h>
 #include <linux/workqueue.h>
 #include <linux/spinlock.h>
@@ -357,14 +358,42 @@ struct dev_pm_ops {
 #define SET_RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn)
 #endif
 
+#define _DEFINE_DEV_PM_OPS(name, \
+			   suspend_fn, resume_fn, \
+			   runtime_suspend_fn, runtime_resume_fn, idle_fn) \
+const struct dev_pm_ops name = { \
+	SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \
+	RUNTIME_PM_OPS(runtime_suspend_fn, runtime_resume_fn, idle_fn) \
+}
+
+#ifdef CONFIG_PM
+#define _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, runtime_suspend_fn, \
+			   runtime_resume_fn, idle_fn, sec) \
+	_DEFINE_DEV_PM_OPS(name, suspend_fn, resume_fn, runtime_suspend_fn, \
+			   runtime_resume_fn, idle_fn); \
+	_EXPORT_SYMBOL(name, sec)
+#else
+#define _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, runtime_suspend_fn, \
+			   runtime_resume_fn, idle_fn, sec) \
+static __maybe_unused _DEFINE_DEV_PM_OPS(__static_##name, suspend_fn, \
+					 resume_fn, runtime_suspend_fn, \
+					 runtime_resume_fn, idle_fn)
+#endif
+
 /*
  * Use this if you want to use the same suspend and resume callbacks for suspend
  * to RAM and hibernation.
+ *
+ * If the underlying dev_pm_ops struct symbol has to be exported, use
+ * EXPORT_SIMPLE_DEV_PM_OPS() or EXPORT_GPL_SIMPLE_DEV_PM_OPS() instead.
  */
 #define DEFINE_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \
-const struct dev_pm_ops name = { \
-	SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \
-}
+	_DEFINE_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL)
+
+#define EXPORT_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \
+	_EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL, "")
+#define EXPORT_GPL_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \
+	_EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL, "_gpl")
 
 /* Deprecated. Use DEFINE_SIMPLE_DEV_PM_OPS() instead. */
 #define SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \
-- 
cgit v1.2.3


From 9d8619190031af0a314bee865262d8975473e4dd Mon Sep 17 00:00:00 2001
From: Paul Cercueil <paul@crapouillou.net>
Date: Fri, 7 Jan 2022 18:17:21 +0000
Subject: PM: runtime: Add DEFINE_RUNTIME_DEV_PM_OPS() macro

A lot of drivers create a dev_pm_ops struct with the system sleep
suspend/resume callbacks set to pm_runtime_force_suspend() and
pm_runtime_force_resume().

These drivers can now use the DEFINE_RUNTIME_DEV_PM_OPS() macro, which
will use pm_runtime_force_{suspend,resume}() as the system sleep
callbacks, while having the same dead code removal characteristic that
is already provided by DEFINE_SIMPLE_DEV_PM_OPS().

Signed-off-by: Paul Cercueil <paul@crapouillou.net>
Acked-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/pm.h         |  3 ++-
 include/linux/pm_runtime.h | 14 ++++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pm.h b/include/linux/pm.h
index 8279af2c538a..f7d2be686359 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -414,7 +414,8 @@ const struct dev_pm_ops __maybe_unused name = { \
  * .resume_early(), to the same routines as .runtime_suspend() and
  * .runtime_resume(), respectively (and analogously for hibernation).
  *
- * Deprecated. You most likely don't want this macro.
+ * Deprecated. You most likely don't want this macro. Use
+ * DEFINE_RUNTIME_DEV_PM_OPS() instead.
  */
 #define UNIVERSAL_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \
 const struct dev_pm_ops __maybe_unused name = { \
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index 016de5776b6d..4af454d29281 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -22,6 +22,20 @@
 					    usage_count */
 #define RPM_AUTO		0x08	/* Use autosuspend_delay */
 
+/*
+ * Use this for defining a set of PM operations to be used in all situations
+ * (system suspend, hibernation or runtime PM).
+ *
+ * Note that the behaviour differs from the deprecated UNIVERSAL_DEV_PM_OPS()
+ * macro, which uses the provided callbacks for both runtime PM and system
+ * sleep, while DEFINE_RUNTIME_DEV_PM_OPS() uses pm_runtime_force_suspend()
+ * and pm_runtime_force_resume() for its system sleep callbacks.
+ */
+#define DEFINE_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \
+	_DEFINE_DEV_PM_OPS(name, pm_runtime_force_suspend, \
+			   pm_runtime_force_resume, suspend_fn, \
+			   resume_fn, idle_fn)
+
 #ifdef CONFIG_PM
 extern struct workqueue_struct *pm_wq;
 
-- 
cgit v1.2.3


From d59ff7d9d84b03d22c5107f794e28fc8e1fce3a6 Mon Sep 17 00:00:00 2001
From: Paul Cercueil <paul@crapouillou.net>
Date: Fri, 7 Jan 2022 18:17:22 +0000
Subject: PM: runtime: Add EXPORT[_GPL]_RUNTIME_DEV_PM_OPS macros

Similar to EXPORT[_GPL]_SIMPLE_DEV_PM_OPS, but for users with runtime-PM
suspend/resume callbacks.

Signed-off-by: Paul Cercueil <paul@crapouillou.net>
Acked-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/pm_runtime.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index 4af454d29281..9f09601c465a 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -30,12 +30,22 @@
  * macro, which uses the provided callbacks for both runtime PM and system
  * sleep, while DEFINE_RUNTIME_DEV_PM_OPS() uses pm_runtime_force_suspend()
  * and pm_runtime_force_resume() for its system sleep callbacks.
+ *
+ * If the underlying dev_pm_ops struct symbol has to be exported, use
+ * EXPORT_RUNTIME_DEV_PM_OPS() or EXPORT_GPL_RUNTIME_DEV_PM_OPS() instead.
  */
 #define DEFINE_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \
 	_DEFINE_DEV_PM_OPS(name, pm_runtime_force_suspend, \
 			   pm_runtime_force_resume, suspend_fn, \
 			   resume_fn, idle_fn)
 
+#define EXPORT_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \
+	_EXPORT_DEV_PM_OPS(name, pm_runtime_force_suspend, pm_runtime_force_resume, \
+			   suspend_fn, resume_fn, idle_fn, "")
+#define EXPORT_GPL_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \
+	_EXPORT_DEV_PM_OPS(name, pm_runtime_force_suspend, pm_runtime_force_resume, \
+			   suspend_fn, resume_fn, idle_fn, "_gpl")
+
 #ifdef CONFIG_PM
 extern struct workqueue_struct *pm_wq;
 
-- 
cgit v1.2.3


From 8a59bb93b7e3cca389af44781a429ac12ac49be6 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <kolga@netapp.com>
Date: Thu, 9 Dec 2021 14:53:30 -0500
Subject: NFSv4 store server support for fs_location attribute

Define and store if server returns it supports fs_locations attribute
as a capability.

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/nfs_fs_sb.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index f24fc67af42d..8c08b356c8ca 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -289,5 +289,5 @@ struct nfs_server {
 #define NFS_CAP_COPY_NOTIFY	(1U << 27)
 #define NFS_CAP_XATTR		(1U << 28)
 #define NFS_CAP_READ_PLUS	(1U << 29)
-
+#define NFS_CAP_FS_LOCATIONS	(1U << 30)
 #endif
-- 
cgit v1.2.3


From 2d7c86a8f9cdce1408c4f3c69d94d007eff2f179 Mon Sep 17 00:00:00 2001
From: Venky Shankar <vshankar@redhat.com>
Date: Wed, 14 Jul 2021 15:35:50 +0530
Subject: libceph: generalize addr/ip parsing based on delimiter

... and remove hardcoded function name in ceph_parse_ips().

[ idryomov: delim parameter, drop CEPH_ADDR_PARSE_DEFAULT_DELIM ]

Signed-off-by: Venky Shankar <vshankar@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/libceph.h   | 2 +-
 include/linux/ceph/messenger.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 409d8c29bc4f..c72285d8594e 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -301,7 +301,7 @@ struct fs_parameter;
 struct fc_log;
 struct ceph_options *ceph_alloc_options(void);
 int ceph_parse_mon_ips(const char *buf, size_t len, struct ceph_options *opt,
-		       struct fc_log *l);
+		       struct fc_log *l, char delim);
 int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt,
 		     struct fc_log *l);
 int ceph_print_client_options(struct seq_file *m, struct ceph_client *client,
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 0e6e9ad3c3bf..ff99ce094cfa 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -532,7 +532,7 @@ extern const char *ceph_pr_addr(const struct ceph_entity_addr *addr);
 
 extern int ceph_parse_ips(const char *c, const char *end,
 			  struct ceph_entity_addr *addr,
-			  int max_count, int *count);
+			  int max_count, int *count, char delim);
 
 extern int ceph_msgr_init(void);
 extern void ceph_msgr_exit(void);
-- 
cgit v1.2.3


From 4153c7fc937a2afa077dbdb9fe3189b9981f423c Mon Sep 17 00:00:00 2001
From: Venky Shankar <vshankar@redhat.com>
Date: Wed, 14 Jul 2021 15:35:51 +0530
Subject: libceph: rename parse_fsid() to ceph_parse_fsid() and export

... as it is too generic. also, use __func__ when logging
rather than hardcoding the function name.

Signed-off-by: Venky Shankar <vshankar@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/libceph.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index c72285d8594e..644f224eccf7 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -296,6 +296,7 @@ extern bool libceph_compatible(void *data);
 extern const char *ceph_msg_type_name(int type);
 extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
 extern void *ceph_kvmalloc(size_t size, gfp_t flags);
+extern int ceph_parse_fsid(const char *str, struct ceph_fsid *fsid);
 
 struct fs_parameter;
 struct fc_log;
-- 
cgit v1.2.3


From 1976b2b31462151403c9fc110204fcc2a77bdfd1 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <kolga@netapp.com>
Date: Wed, 12 Jan 2022 10:27:38 -0500
Subject: NFSv4.1 query for fs_location attr on a new file system

Query the server for other possible trunkable locations for a given
file system on a 4.1+ mount.

v2:
-- added missing static to nfs4_discover_trunking,
reported by the kernel test robot

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/nfs_xdr.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index ddff92396a3f..728cb0c1f0b6 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1797,6 +1797,7 @@ struct nfs_rpc_ops {
 	struct nfs_server *(*create_server)(struct fs_context *);
 	struct nfs_server *(*clone_server)(struct nfs_server *, struct nfs_fh *,
 					   struct nfs_fattr *, rpc_authflavor_t);
+	int	(*discover_trunking)(struct nfs_server *, struct nfs_fh *);
 };
 
 /*
-- 
cgit v1.2.3


From 180dccb0dba4f5e84a4a70c1be1d34cbb6528b32 Mon Sep 17 00:00:00 2001
From: Laibin Qiu <qiulaibin@huawei.com>
Date: Thu, 13 Jan 2022 10:55:36 +0800
Subject: blk-mq: fix tag_get wait task can't be awakened

In case of shared tags, there might be more than one hctx which
allocates from the same tags, and each hctx is limited to allocate at
most:
        hctx_max_depth = max((bt->sb.depth + users - 1) / users, 4U);

tag idle detection is lazy, and may be delayed for 30sec, so there
could be just one real active hctx(queue) but all others are actually
idle and still accounted as active because of the lazy idle detection.
Then if wake_batch is > hctx_max_depth, driver tag allocation may wait
forever on this real active hctx.

Fix this by recalculating wake_batch when inc or dec active_queues.

Fixes: 0d2602ca30e41 ("blk-mq: improve support for shared tags maps")
Suggested-by: Ming Lei <ming.lei@redhat.com>
Suggested-by: John Garry <john.garry@huawei.com>
Signed-off-by: Laibin Qiu <qiulaibin@huawei.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20220113025536.1479653-1-qiulaibin@huawei.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/sbitmap.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index fc0357a6e19b..95df357ec009 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -415,6 +415,17 @@ static inline void sbitmap_queue_free(struct sbitmap_queue *sbq)
 	sbitmap_free(&sbq->sb);
 }
 
+/**
+ * sbitmap_queue_recalculate_wake_batch() - Recalculate wake batch
+ * @sbq: Bitmap queue to recalculate wake batch.
+ * @users: Number of shares.
+ *
+ * Like sbitmap_queue_update_wake_batch(), this will calculate wake batch
+ * by depth. This interface is for HCTX shared tags or queue shared tags.
+ */
+void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq,
+					    unsigned int users);
+
 /**
  * sbitmap_queue_resize() - Resize a &struct sbitmap_queue.
  * @sbq: Bitmap queue to resize.
-- 
cgit v1.2.3


From 289e7b0f7eb47b87a0441e6c81336316f301eb39 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 13 Dec 2021 11:08:53 +0100
Subject: tracing: Account bottom half disabled sections.

Disabling only bottom halves via local_bh_disable() disables also
preemption but this remains invisible to tracing. On a CONFIG_PREEMPT
kernel one might wonder why there is no scheduling happening despite the
N flag in the trace. The reason might be the a rcu_read_lock_bh()
section.

Add a 'b' to the tracing output if in task context with disabled bottom
halves.

Link: https://lkml.kernel.org/r/YbcbtdtC/bjCKo57@linutronix.de

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/trace_events.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 3900404aa063..70c069aef02c 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -172,6 +172,7 @@ enum trace_flag_type {
 	TRACE_FLAG_SOFTIRQ		= 0x10,
 	TRACE_FLAG_PREEMPT_RESCHED	= 0x20,
 	TRACE_FLAG_NMI			= 0x40,
+	TRACE_FLAG_BH_OFF		= 0x80,
 };
 
 #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
-- 
cgit v1.2.3


From 6840f9094f2bd788a316d8cb0a4e42538d3e47dd Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 13 Jan 2022 16:44:19 -0500
Subject: pagevec: Initialise folio_batch->percpu_pvec_drained

When UBSAN is enabled, it reports an invalid value in __pagevec_release()
when accessing pvec->percpu_pvec_drained, which is simply whatever
garbage was on the stack.  Initialise it when initialising the rest of
the folio_batch.

Fixes: 10331795fb79 ("pagevec: Add folio_batch")
Reported-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 include/linux/pagevec.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index dda8d5868c81..67b1246f136b 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -111,6 +111,7 @@ static_assert(offsetof(struct pagevec, pages) ==
 static inline void folio_batch_init(struct folio_batch *fbatch)
 {
 	fbatch->nr = 0;
+	fbatch->percpu_pvec_drained = false;
 }
 
 static inline unsigned int folio_batch_count(struct folio_batch *fbatch)
-- 
cgit v1.2.3


From e5b48ee30aec1fe6dff05e36b22e886c665b4736 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Date: Tue, 4 Jan 2022 16:14:46 +0900
Subject: ata: sata_fsl: fix scsi host initialization

When compiling with W=1, the sata_fsl driver compilation throws the
warning:

drivers/ata/sata_fsl.c:1385:22: error: initialized field overwritten
[-Werror=override-init]
 1385 |         .can_queue = SATA_FSL_QUEUE_DEPTH,

This is due to the driver scsi host template initialization overwriting
the can_queue field that is already set using the ATA_NCQ_SHT()
initializer macro, resulting in the same field being initialized twice
in the host template declaration.

To remove this warning, introduce the ATA_SUBBASE_SHT_QD() and
ATA_NCQ_SHT_QD() initialization macros to allow specifying a queue depth
different from the default ATA_DEF_QUEUE using an additional argument to
the macro.

Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
---
 include/linux/libata.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index c258f69106f4..2e5e7c40c991 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1385,6 +1385,12 @@ extern const struct attribute_group *ata_common_sdev_groups[];
 	.tag_alloc_policy	= BLK_TAG_ALLOC_RR,		\
 	.slave_configure	= ata_scsi_slave_config
 
+#define ATA_SUBBASE_SHT_QD(drv_name, drv_qd)			\
+	__ATA_BASE_SHT(drv_name),				\
+	.can_queue		= drv_qd,			\
+	.tag_alloc_policy	= BLK_TAG_ALLOC_RR,		\
+	.slave_configure	= ata_scsi_slave_config
+
 #define ATA_BASE_SHT(drv_name)					\
 	ATA_SUBBASE_SHT(drv_name),				\
 	.sdev_groups		= ata_common_sdev_groups
@@ -1396,6 +1402,11 @@ extern const struct attribute_group *ata_ncq_sdev_groups[];
 	ATA_SUBBASE_SHT(drv_name),				\
 	.sdev_groups		= ata_ncq_sdev_groups,		\
 	.change_queue_depth	= ata_scsi_change_queue_depth
+
+#define ATA_NCQ_SHT_QD(drv_name, drv_qd)			\
+	ATA_SUBBASE_SHT_QD(drv_name, drv_qd),			\
+	.sdev_groups		= ata_ncq_sdev_groups,		\
+	.change_queue_depth	= ata_scsi_change_queue_depth
 #endif
 
 /*
-- 
cgit v1.2.3


From 0561e514c944da874ccdfbe2922f71b4c333c7e1 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Date: Tue, 4 Jan 2022 17:54:18 +0900
Subject: ata: fix read_id() ata port operation interface

Drivers that need to tweak a device IDENTIFY data implement the
read_id() port operation. The IDENTIFY data buffer is passed as an
argument to the read_id() operation for drivers to use. However, when
this operation is called, the IDENTIFY data is not yet converted to CPU
endian and contains le16 words.

Change the interface of the read_id operation to pass a __le16 * pointer
to the IDENTIFY data buffer to clarify the buffer endianness. Fix the
pata_netcell, pata_it821x, ahci_xgene, ahci_ceva and ahci_brcm drivers
implementation of this operation and modify the code to corretly deal
with identify data words manipulation to avoid sparse warnings such as:

drivers/ata/ahci_xgene.c:262:33: warning: invalid assignment: &=
drivers/ata/ahci_xgene.c:262:33:    left side has type unsigned short
drivers/ata/ahci_xgene.c:262:33:    right side has type restricted __le16

Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
---
 include/linux/libata.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 2e5e7c40c991..bf706cd45674 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -884,7 +884,8 @@ struct ata_port_operations {
 	void (*set_piomode)(struct ata_port *ap, struct ata_device *dev);
 	void (*set_dmamode)(struct ata_port *ap, struct ata_device *dev);
 	int  (*set_mode)(struct ata_link *link, struct ata_device **r_failed_dev);
-	unsigned int (*read_id)(struct ata_device *dev, struct ata_taskfile *tf, u16 *id);
+	unsigned int (*read_id)(struct ata_device *dev, struct ata_taskfile *tf,
+				__le16 *id);
 
 	void (*dev_config)(struct ata_device *dev);
 
@@ -1119,7 +1120,7 @@ extern void ata_id_string(const u16 *id, unsigned char *s,
 extern void ata_id_c_string(const u16 *id, unsigned char *s,
 			    unsigned int ofs, unsigned int len);
 extern unsigned int ata_do_dev_read_id(struct ata_device *dev,
-					struct ata_taskfile *tf, u16 *id);
+				       struct ata_taskfile *tf, __le16 *id);
 extern void ata_qc_complete(struct ata_queued_cmd *qc);
 extern u64 ata_qc_get_active(struct ata_port *ap);
 extern void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd);
-- 
cgit v1.2.3


From b9ba367c513dbc165dd6c01266a59db4be2a3564 Mon Sep 17 00:00:00 2001
From: Paul Menzel <pmenzel@molgen.mpg.de>
Date: Wed, 5 Jan 2022 16:36:16 +0100
Subject: ata: libata: Rename link flag ATA_LFLAG_NO_DB_DELAY

Rename the link flag ATA_LFLAG_NO_DB_DELAY to
ATA_LFLAG_NO_DEBOUNCE_DELAY. The new name is longer, but clearer.

Signed-off-by: Paul Menzel <pmenzel@molgen.mpg.de>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
---
 include/linux/libata.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index bf706cd45674..605756f645be 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -143,7 +143,7 @@ enum {
 	ATA_LFLAG_NO_LPM	= (1 << 8), /* disable LPM on this link */
 	ATA_LFLAG_RST_ONCE	= (1 << 9), /* limit recovery to one reset */
 	ATA_LFLAG_CHANGED	= (1 << 10), /* LPM state changed on this link */
-	ATA_LFLAG_NO_DB_DELAY	= (1 << 11), /* no debounce delay on link resume */
+	ATA_LFLAG_NO_DEBOUNCE_DELAY = (1 << 11), /* no debounce delay on link resume */
 
 	/* struct ata_port flags */
 	ATA_FLAG_SLAVE_POSS	= (1 << 0), /* host supports slave dev */
-- 
cgit v1.2.3


From d9679d0013a66849f23057978f92e76b255c50aa Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Wed, 13 Oct 2021 06:55:44 -0400
Subject: virtio: wrap config->reset calls

This will enable cleanups down the road.
The idea is to disable cbs, then add "flush_queued_cbs" callback
as a parameter, this way drivers can flush any work
queued after callbacks have been disabled.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Link: https://lore.kernel.org/r/20211013105226.20225-1-mst@redhat.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/virtio.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 41edbc01ffa4..72292a62cd90 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -138,6 +138,7 @@ int virtio_finalize_features(struct virtio_device *dev);
 int virtio_device_freeze(struct virtio_device *dev);
 int virtio_device_restore(struct virtio_device *dev);
 #endif
+void virtio_reset_device(struct virtio_device *dev);
 
 size_t virtio_max_dma_size(struct virtio_device *vdev);
 
-- 
cgit v1.2.3


From 539fec78edb4e084e7c532affc56cc42d4ceea4b Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Fri, 26 Nov 2021 17:47:53 +0100
Subject: vdpa: add driver_override support

`driver_override` allows to control which of the vDPA bus drivers
binds to a vDPA device.

If `driver_override` is not set, the previous behaviour is followed:
devices use the first vDPA bus driver loaded (unless auto binding
is disabled).

Tested on Fedora 34 with driverctl(8):
  $ modprobe virtio-vdpa
  $ modprobe vhost-vdpa
  $ modprobe vdpa-sim-net

  $ vdpa dev add mgmtdev vdpasim_net name dev1

  # dev1 is attached to the first vDPA bus driver loaded
  $ driverctl -b vdpa list-devices
    dev1 virtio_vdpa

  $ driverctl -b vdpa set-override dev1 vhost_vdpa

  $ driverctl -b vdpa list-devices
    dev1 vhost_vdpa [*]

  Note: driverctl(8) integrates with udev so the binding is
  preserved.

Suggested-by: Jason Wang <jasowang@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Link: https://lore.kernel.org/r/20211126164753.181829-3-sgarzare@redhat.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/vdpa.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index c3011ccda430..ae34015b37b7 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -64,6 +64,7 @@ struct vdpa_mgmt_dev;
  * struct vdpa_device - representation of a vDPA device
  * @dev: underlying device
  * @dma_dev: the actual device that is performing DMA
+ * @driver_override: driver name to force a match
  * @config: the configuration ops for this device.
  * @cf_mutex: Protects get and set access to configuration layout.
  * @index: device index
@@ -76,6 +77,7 @@ struct vdpa_mgmt_dev;
 struct vdpa_device {
 	struct device dev;
 	struct device *dma_dev;
+	const char *driver_override;
 	const struct vdpa_config_ops *config;
 	struct mutex cf_mutex; /* Protects get/set config */
 	unsigned int index;
-- 
cgit v1.2.3


From 28cc408be72cebb0f3fcc37bc74ab3196d4de726 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= <eperezma@redhat.com>
Date: Thu, 4 Nov 2021 20:52:48 +0100
Subject: vdpa: Mark vdpa_config_ops.get_vq_notification as optional
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since vhost_vdpa_mmap checks for its existence before calling it.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Link: https://lore.kernel.org/r/20211104195248.2088904-1-eperezma@redhat.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
---
 include/linux/vdpa.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index ae34015b37b7..2b7db96bb7d3 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -157,7 +157,7 @@ struct vdpa_map_file {
  *				@vdev: vdpa device
  *				@idx: virtqueue index
  *				@state: pointer to returned state (last_avail_idx)
- * @get_vq_notification:	Get the notification area for a virtqueue
+ * @get_vq_notification:	Get the notification area for a virtqueue (optional)
  *				@vdev: vdpa device
  *				@idx: virtqueue index
  *				Returns the notifcation area
-- 
cgit v1.2.3


From a64917bc2e9b1e0aa716b783c4ec879fdd280300 Mon Sep 17 00:00:00 2001
From: Eli Cohen <elic@nvidia.com>
Date: Wed, 5 Jan 2022 13:46:33 +0200
Subject: vdpa: Provide interface to read driver features

Provide an interface to read the negotiated features. This is needed
when building the netlink message in vdpa_dev_net_config_fill().

Also fix the implementation of vdpa_dev_net_config_fill() to use the
negotiated features instead of the device features.

To make APIs clearer, make the following name changes to struct
vdpa_config_ops so they better describe their operations:

get_features -> get_device_features
set_features -> set_driver_features

Finally, add get_driver_features to return the negotiated features and
add implementation to all the upstream drivers.

Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Eli Cohen <elic@nvidia.com>
Link: https://lore.kernel.org/r/20220105114646.577224-2-elic@nvidia.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/vdpa.h | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 2b7db96bb7d3..9cc4291a79b3 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -171,14 +171,17 @@ struct vdpa_map_file {
  *				for the device
  *				@vdev: vdpa device
  *				Returns virtqueue algin requirement
- * @get_features:		Get virtio features supported by the device
+ * @get_device_features:	Get virtio features supported by the device
  *				@vdev: vdpa device
  *				Returns the virtio features support by the
  *				device
- * @set_features:		Set virtio features supported by the driver
+ * @set_driver_features:	Set virtio features supported by the driver
  *				@vdev: vdpa device
  *				@features: feature support by the driver
  *				Returns integer: success (0) or error (< 0)
+ * @get_driver_features:	Get the virtio driver features in action
+ *				@vdev: vdpa device
+ *				Returns the virtio features accepted
  * @set_config_cb:		Set the config interrupt callback
  *				@vdev: vdpa device
  *				@cb: virtio-vdev interrupt callback structure
@@ -278,8 +281,9 @@ struct vdpa_config_ops {
 
 	/* Device ops */
 	u32 (*get_vq_align)(struct vdpa_device *vdev);
-	u64 (*get_features)(struct vdpa_device *vdev);
-	int (*set_features)(struct vdpa_device *vdev, u64 features);
+	u64 (*get_device_features)(struct vdpa_device *vdev);
+	int (*set_driver_features)(struct vdpa_device *vdev, u64 features);
+	u64 (*get_driver_features)(struct vdpa_device *vdev);
 	void (*set_config_cb)(struct vdpa_device *vdev,
 			      struct vdpa_callback *cb);
 	u16 (*get_vq_num_max)(struct vdpa_device *vdev);
@@ -397,7 +401,7 @@ static inline int vdpa_set_features(struct vdpa_device *vdev, u64 features)
 	const struct vdpa_config_ops *ops = vdev->config;
 
 	vdev->features_valid = true;
-	return ops->set_features(vdev, features);
+	return ops->set_driver_features(vdev, features);
 }
 
 void vdpa_get_config(struct vdpa_device *vdev, unsigned int offset,
-- 
cgit v1.2.3


From 73bc0dbb591baea322a7319c735e5f6c7dba9cfb Mon Sep 17 00:00:00 2001
From: Eli Cohen <elic@nvidia.com>
Date: Wed, 5 Jan 2022 13:46:35 +0200
Subject: vdpa: Sync calls set/get config/status with cf_mutex

Add wrappers to get/set status and protect these operations with
cf_mutex to serialize these operations with respect to get/set config
operations.

Signed-off-by: Eli Cohen <elic@nvidia.com>
Link: https://lore.kernel.org/r/20220105114646.577224-4-elic@nvidia.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/vdpa.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 9cc4291a79b3..ae047fae2603 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -408,6 +408,9 @@ void vdpa_get_config(struct vdpa_device *vdev, unsigned int offset,
 		     void *buf, unsigned int len);
 void vdpa_set_config(struct vdpa_device *dev, unsigned int offset,
 		     const void *buf, unsigned int length);
+u8 vdpa_get_status(struct vdpa_device *vdev);
+void vdpa_set_status(struct vdpa_device *vdev, u8 status);
+
 /**
  * struct vdpa_mgmtdev_ops - vdpa device ops
  * @dev_add: Add a vdpa device using alloc and register
-- 
cgit v1.2.3


From aba21aff772b8622e08f07219069be793429a48f Mon Sep 17 00:00:00 2001
From: Eli Cohen <elic@nvidia.com>
Date: Wed, 5 Jan 2022 13:46:37 +0200
Subject: vdpa: Allow to configure max data virtqueues

Add netlink support to configure the max virtqueue pairs for a device.
At least one pair is required. The maximum is dictated by the device.

Example:
$ vdpa dev add name vdpa-a mgmtdev auxiliary/mlx5_core.sf.1 max_vqp 4

Signed-off-by: Eli Cohen <elic@nvidia.com>
Link: https://lore.kernel.org/r/20220105114646.577224-6-elic@nvidia.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/vdpa.h | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index ae047fae2603..6d4d7e4fe208 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -101,6 +101,7 @@ struct vdpa_dev_set_config {
 	struct {
 		u8 mac[ETH_ALEN];
 		u16 mtu;
+		u16 max_vq_pairs;
 	} net;
 	u64 mask;
 };
@@ -391,17 +392,29 @@ static inline struct device *vdpa_get_dma_dev(struct vdpa_device *vdev)
 static inline int vdpa_reset(struct vdpa_device *vdev)
 {
 	const struct vdpa_config_ops *ops = vdev->config;
+	int ret;
 
+	mutex_lock(&vdev->cf_mutex);
 	vdev->features_valid = false;
-	return ops->reset(vdev);
+	ret = ops->reset(vdev);
+	mutex_unlock(&vdev->cf_mutex);
+	return ret;
 }
 
-static inline int vdpa_set_features(struct vdpa_device *vdev, u64 features)
+static inline int vdpa_set_features(struct vdpa_device *vdev, u64 features, bool locked)
 {
 	const struct vdpa_config_ops *ops = vdev->config;
+	int ret;
+
+	if (!locked)
+		mutex_lock(&vdev->cf_mutex);
 
 	vdev->features_valid = true;
-	return ops->set_driver_features(vdev, features);
+	ret = ops->set_driver_features(vdev, features);
+	if (!locked)
+		mutex_unlock(&vdev->cf_mutex);
+
+	return ret;
 }
 
 void vdpa_get_config(struct vdpa_device *vdev, unsigned int offset,
-- 
cgit v1.2.3


From cd2629f6df1cab5b3df34705ae7f3bde6147fce3 Mon Sep 17 00:00:00 2001
From: Eli Cohen <elic@nvidia.com>
Date: Wed, 5 Jan 2022 13:46:42 +0200
Subject: vdpa: Support reporting max device capabilities

Add max_supported_vqs and supported_features fields to struct
vdpa_mgmt_dev. Upstream drivers need to feel these values according to
the device capabilities.

These values are reported back in a netlink message when showing management
devices.

Examples:

$ auxiliary/mlx5_core.sf.1:
  supported_classes net
  max_supported_vqs 257
  dev_features CSUM GUEST_CSUM MTU HOST_TSO4 HOST_TSO6 STATUS CTRL_VQ MQ \
               CTRL_MAC_ADDR VERSION_1 ACCESS_PLATFORM

$ vdpa -j mgmtdev show
{"mgmtdev":{"auxiliary/mlx5_core.sf.1":{"supported_classes":["net"], \
  "max_supported_vqs":257,"dev_features":["CSUM","GUEST_CSUM","MTU", \
  "HOST_TSO4","HOST_TSO6","STATUS","CTRL_VQ","MQ","CTRL_MAC_ADDR", \
  "VERSION_1","ACCESS_PLATFORM"]}}}

$ vdpa -jp mgmtdev show
{
    "mgmtdev": {
        "auxiliary/mlx5_core.sf.1": {
            "supported_classes": [ "net" ],
            "max_supported_vqs": 257,
            "dev_features": ["CSUM","GUEST_CSUM","MTU","HOST_TSO4", \
                             "HOST_TSO6","STATUS","CTRL_VQ","MQ", \
                             "CTRL_MAC_ADDR","VERSION_1","ACCESS_PLATFORM"]
        }
    }
}

Signed-off-by: Eli Cohen <elic@nvidia.com>
Link: https://lore.kernel.org/r/20220105114646.577224-11-elic@nvidia.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Si-Wei Liu<si-wei.liu@oracle.com>
---
 include/linux/vdpa.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 6d4d7e4fe208..a6047fd6cf12 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -460,6 +460,8 @@ struct vdpa_mgmt_dev {
 	const struct virtio_device_id *id_table;
 	u64 config_attr_mask;
 	struct list_head list;
+	u64 supported_features;
+	u32 max_supported_vqs;
 };
 
 int vdpa_mgmtdev_register(struct vdpa_mgmt_dev *mdev);
-- 
cgit v1.2.3


From f6d955d80830b6e6f6a170be68cc3628f36365dd Mon Sep 17 00:00:00 2001
From: Eli Cohen <elic@nvidia.com>
Date: Tue, 11 Jan 2022 20:33:57 +0200
Subject: vdpa: Avoid taking cf_mutex lock on get status

Avoid the wrapper holding cf_mutex since it is not protecting anything.
To avoid confusion and unnecessary overhead incurred by it, remove.

Fixes: f489f27bc0ab ("vdpa: Sync calls set/get config/status with cf_mutex")
Signed-off-by: Eli Cohen <elic@nvidia.com>
Link: https://lore.kernel.org/r/20220111183400.38418-2-elic@nvidia.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Si-Wei Liu<si-wei.liu@oracle.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
 include/linux/vdpa.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index a6047fd6cf12..2de442ececae 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -421,7 +421,6 @@ void vdpa_get_config(struct vdpa_device *vdev, unsigned int offset,
 		     void *buf, unsigned int len);
 void vdpa_set_config(struct vdpa_device *dev, unsigned int offset,
 		     const void *buf, unsigned int length);
-u8 vdpa_get_status(struct vdpa_device *vdev);
 void vdpa_set_status(struct vdpa_device *vdev, u8 status);
 
 /**
-- 
cgit v1.2.3


From 800977f6f32e452cba6b04ef21d2f5383ca29209 Mon Sep 17 00:00:00 2001
From: Cai Huoqing <caihuoqing@baidu.com>
Date: Fri, 14 Jan 2022 14:02:52 -0800
Subject: kthread: add the helper function kthread_run_on_cpu()

Add a new helper function kthread_run_on_cpu(), which includes
kthread_create_on_cpu/wake_up_process().

In some cases, use kthread_run_on_cpu() directly instead of
kthread_create_on_node/kthread_bind/wake_up_process() or
kthread_create_on_cpu/wake_up_process() or
kthreadd_create/kthread_bind/wake_up_process() to simplify the code.

[akpm@linux-foundation.org: export kthread_create_on_cpu to modules]

Link: https://lkml.kernel.org/r/20211022025711.3673-2-caihuoqing@baidu.com
Signed-off-by: Cai Huoqing <caihuoqing@baidu.com>
Cc: Bernard Metzler <bmt@zurich.ibm.com>
Cc: Cai Huoqing <caihuoqing@baidu.com>
Cc: Daniel Bristot de Oliveira <bristot@kernel.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Doug Ledford <dledford@redhat.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: "Paul E . McKenney" <paulmck@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kthread.h | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 346b0f269161..db47aae7c481 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -56,6 +56,31 @@ bool kthread_is_per_cpu(struct task_struct *k);
 	__k;								   \
 })
 
+/**
+ * kthread_run_on_cpu - create and wake a cpu bound thread.
+ * @threadfn: the function to run until signal_pending(current).
+ * @data: data ptr for @threadfn.
+ * @cpu: The cpu on which the thread should be bound,
+ * @namefmt: printf-style name for the thread. Format is restricted
+ *	     to "name.*%u". Code fills in cpu number.
+ *
+ * Description: Convenient wrapper for kthread_create_on_cpu()
+ * followed by wake_up_process().  Returns the kthread or
+ * ERR_PTR(-ENOMEM).
+ */
+static inline struct task_struct *
+kthread_run_on_cpu(int (*threadfn)(void *data), void *data,
+			unsigned int cpu, const char *namefmt)
+{
+	struct task_struct *p;
+
+	p = kthread_create_on_cpu(threadfn, data, cpu, namefmt);
+	if (!IS_ERR(p))
+		wake_up_process(p);
+
+	return p;
+}
+
 void free_kthread_struct(struct task_struct *k);
 void kthread_bind(struct task_struct *k, unsigned int cpu);
 void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask);
-- 
cgit v1.2.3


From 60115fa54ad7b913b7cb5844e6b7ffeb842d55f2 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Fri, 14 Jan 2022 14:04:11 -0800
Subject: mm: defer kmemleak object creation of module_alloc()

Yongqiang reports a kmemleak panic when module insmod/rmmod with KASAN
enabled(without KASAN_VMALLOC) on x86[1].

When the module area allocates memory, it's kmemleak_object is created
successfully, but the KASAN shadow memory of module allocation is not
ready, so when kmemleak scan the module's pointer, it will panic due to
no shadow memory with KASAN check.

  module_alloc
    __vmalloc_node_range
      kmemleak_vmalloc
				kmemleak_scan
				  update_checksum
    kasan_module_alloc
      kmemleak_ignore

Note, there is no problem if KASAN_VMALLOC enabled, the modules area
entire shadow memory is preallocated.  Thus, the bug only exits on ARCH
which supports dynamic allocation of module area per module load, for
now, only x86/arm64/s390 are involved.

Add a VM_DEFER_KMEMLEAK flags, defer vmalloc'ed object register of
kmemleak in module_alloc() to fix this issue.

[1] https://lore.kernel.org/all/6d41e2b9-4692-5ec4-b1cd-cbe29ae89739@huawei.com/

[wangkefeng.wang@huawei.com: fix build]
  Link: https://lkml.kernel.org/r/20211125080307.27225-1-wangkefeng.wang@huawei.com
[akpm@linux-foundation.org: simplify ifdefs, per Andrey]
  Link: https://lkml.kernel.org/r/CA+fCnZcnwJHUQq34VuRxpdoY6_XbJCDJ-jopksS5Eia4PijPzw@mail.gmail.com

Link: https://lkml.kernel.org/r/20211124142034.192078-1-wangkefeng.wang@huawei.com
Fixes: 793213a82de4 ("s390/kasan: dynamic shadow mem allocation for modules")
Fixes: 39d114ddc682 ("arm64: add KASAN support")
Fixes: bebf56a1b176 ("kasan: enable instrumentation of global variables")
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reported-by: Yongqiang Liu <liuyongqiang13@huawei.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kasan.h   | 4 ++--
 include/linux/vmalloc.h | 7 +++++++
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index d8783b682669..89c99e5e67de 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -474,12 +474,12 @@ static inline void kasan_populate_early_vm_area_shadow(void *start,
  * allocations with real shadow memory. With KASAN vmalloc, the special
  * case is unnecessary, as the work is handled in the generic case.
  */
-int kasan_module_alloc(void *addr, size_t size);
+int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask);
 void kasan_free_shadow(const struct vm_struct *vm);
 
 #else /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */
 
-static inline int kasan_module_alloc(void *addr, size_t size) { return 0; }
+static inline int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask) { return 0; }
 static inline void kasan_free_shadow(const struct vm_struct *vm) {}
 
 #endif /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 6e022cc712e6..880227b9f044 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -28,6 +28,13 @@ struct notifier_block;		/* in notifier.h */
 #define VM_MAP_PUT_PAGES	0x00000200	/* put pages and free array in vfree */
 #define VM_NO_HUGE_VMAP		0x00000400	/* force PAGE_SIZE pte mapping */
 
+#if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \
+	!defined(CONFIG_KASAN_VMALLOC)
+#define VM_DEFER_KMEMLEAK	0x00000800	/* defer kmemleak object creation */
+#else
+#define VM_DEFER_KMEMLEAK	0
+#endif
+
 /*
  * VM_KASAN is used slightly differently depending on CONFIG_KASAN_VMALLOC.
  *
-- 
cgit v1.2.3


From c4386bd8ee3a921c3c799b7197dc898ade76a453 Mon Sep 17 00:00:00 2001
From: Joao Martins <joao.m.martins@oracle.com>
Date: Fri, 14 Jan 2022 14:04:22 -0800
Subject: mm/memremap: add ZONE_DEVICE support for compound pages

Add a new @vmemmap_shift property for struct dev_pagemap which specifies
that a devmap is composed of a set of compound pages of order
@vmemmap_shift, instead of base pages.  When a compound page devmap is
requested, all but the first page are initialised as tail pages instead
of order-0 pages.

For certain ZONE_DEVICE users like device-dax which have a fixed page
size, this creates an opportunity to optimize GUP and GUP-fast walkers,
treating it the same way as THP or hugetlb pages.

Additionally, commit 7118fc2906e2 ("hugetlb: address ref count racing in
prep_compound_gigantic_page") removed set_page_count() because the
setting of page ref count to zero was redundant.  devmap pages don't
come from page allocator though and only head page refcount is used for
compound pages, hence initialize tail page count to zero.

Link: https://lkml.kernel.org/r/20211202204422.26777-5-joao.m.martins@oracle.com
Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memremap.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index c0e9d35889e8..61a6a0e27359 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -99,6 +99,11 @@ struct dev_pagemap_ops {
  * @done: completion for @internal_ref
  * @type: memory type: see MEMORY_* in memory_hotplug.h
  * @flags: PGMAP_* flags to specify defailed behavior
+ * @vmemmap_shift: structural definition of how the vmemmap page metadata
+ *      is populated, specifically the metadata page order.
+ *	A zero value (default) uses base pages as the vmemmap metadata
+ *	representation. A bigger value will set up compound struct pages
+ *	of the requested order value.
  * @ops: method table
  * @owner: an opaque pointer identifying the entity that manages this
  *	instance.  Used by various helpers to make sure that no
@@ -114,6 +119,7 @@ struct dev_pagemap {
 	struct completion done;
 	enum memory_type type;
 	unsigned int flags;
+	unsigned long vmemmap_shift;
 	const struct dev_pagemap_ops *ops;
 	void *owner;
 	int nr_range;
@@ -130,6 +136,11 @@ static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap)
 	return NULL;
 }
 
+static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap)
+{
+	return 1 << pgmap->vmemmap_shift;
+}
+
 #ifdef CONFIG_ZONE_DEVICE
 void *memremap_pages(struct dev_pagemap *pgmap, int nid);
 void memunmap_pages(struct dev_pagemap *pgmap);
-- 
cgit v1.2.3


From 3e9d80a891df3b1a5d77db47fa7fdf33ba71e5cb Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 14 Jan 2022 14:05:04 -0800
Subject: mm,fs: split dump_mapping() out from dump_page()

dump_mapping() is a big chunk of dump_page(), and it'd be handy to be
able to call it when we don't have a struct page.  Split it out and move
it to fs/inode.c.  Take the opportunity to simplify some of the debug
messages a little.

Link: https://lkml.kernel.org/r/20211121121056.2870061-1-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/fs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index bbf812ce89a8..5315fa68f751 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3152,6 +3152,7 @@ extern void unlock_new_inode(struct inode *);
 extern void discard_new_inode(struct inode *);
 extern unsigned int get_next_ino(void);
 extern void evict_inodes(struct super_block *sb);
+void dump_mapping(const struct address_space *);
 
 /*
  * Userspace may rely on the the inode number being non-zero. For example, glibc
-- 
cgit v1.2.3


From b6bf9abb0aa44e53ffe9c1e6e1d32568f5b25e4a Mon Sep 17 00:00:00 2001
From: Dan Schatzberg <schatzberg.dan@gmail.com>
Date: Fri, 14 Jan 2022 14:05:35 -0800
Subject: mm/memcg: add oom_group_kill memory event

Our container agent wants to know when a container exits if it was OOM
killed or not to report to the user.  We use memory.oom.group = 1 to
ensure that OOM kills within the container's cgroup kill everything.
Existing memory.events are insufficient for knowing if this triggered:

1) Our current approach reads memory.events oom_kill and reports the
   container was killed if the value is non-zero. This is erroneous in
   some cases where containers create their children cgroups with
   memory.oom.group=1 as such OOM kills will get counted against the
   container cgroup's oom_kill counter despite not actually OOM killing
   the entire container.

2) Reading memory.events.local will fail to identify OOM kills in leaf
   cgroups (that don't set memory.oom.group) within the container
   cgroup.

This patch adds a new oom_group_kill event when memory.oom.group
triggers to allow userspace to cleanly identify when an entire cgroup is
oom killed.

[schatzberg.dan@gmail.com: changes from Johannes and Chris]
  Link: https://lkml.kernel.org/r/20211213162511.2492267-1-schatzberg.dan@gmail.com

Link: https://lkml.kernel.org/r/20211203162426.3375036-1-schatzberg.dan@gmail.com
Signed-off-by: Dan Schatzberg <schatzberg.dan@gmail.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Chris Down <chris@chrisdown.name>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Zefan Li <lizefan.x@bytedance.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Alex Shi <alexs@kernel.org>
Cc: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0c5c403f4be6..951f24f42147 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -42,6 +42,7 @@ enum memcg_memory_event {
 	MEMCG_MAX,
 	MEMCG_OOM,
 	MEMCG_OOM_KILL,
+	MEMCG_OOM_GROUP_KILL,
 	MEMCG_SWAP_HIGH,
 	MEMCG_SWAP_MAX,
 	MEMCG_SWAP_FAIL,
-- 
cgit v1.2.3


From 4e5aa1f4c2b489bc6f3ab5ca54747b18a847289d Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeelb@google.com>
Date: Fri, 14 Jan 2022 14:05:45 -0800
Subject: memcg: add per-memcg vmalloc stat

The kvmalloc* allocation functions can fallback to vmalloc allocations
and more often on long running machines.  In addition the kernel does
have __GFP_ACCOUNT kvmalloc* calls.  So, often on long running machines,
the memory.stat does not tell the complete picture which type of memory
is charged to the memcg.  So add a per-memcg vmalloc stat.

[shakeelb@google.com: page_memcg() within rcu lock, per Muchun]
  Link: https://lkml.kernel.org/r/20211222052457.1960701-1-shakeelb@google.com
[akpm@linux-foundation.org: remove cast, per Muchun]
[shakeelb@google.com: remove area->page[0] checks and move to page by page accounting per Michal]
  Link: https://lkml.kernel.org/r/20220104222341.3972772-1-shakeelb@google.com

Link: https://lkml.kernel.org/r/20211221215336.1922823-1-shakeelb@google.com
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 951f24f42147..0131e5574c88 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -33,6 +33,7 @@ enum memcg_stat_item {
 	MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS,
 	MEMCG_SOCK,
 	MEMCG_PERCPU_B,
+	MEMCG_VMALLOC,
 	MEMCG_NR_STAT,
 };
 
@@ -992,6 +993,21 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg,
 	local_irq_restore(flags);
 }
 
+static inline void mod_memcg_page_state(struct page *page,
+					int idx, int val)
+{
+	struct mem_cgroup *memcg;
+
+	if (mem_cgroup_disabled())
+		return;
+
+	rcu_read_lock();
+	memcg = page_memcg(page);
+	if (memcg)
+		mod_memcg_state(memcg, idx, val);
+	rcu_read_unlock();
+}
+
 static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
 {
 	return READ_ONCE(memcg->vmstats.state[idx]);
@@ -1447,6 +1463,11 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg,
 {
 }
 
+static inline void mod_memcg_page_state(struct page *page,
+					int idx, int val)
+{
+}
+
 static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
 {
 	return 0;
-- 
cgit v1.2.3


From 9a10064f5625d5572c3626c1516e0bebc6c9fe9b Mon Sep 17 00:00:00 2001
From: Colin Cross <ccross@google.com>
Date: Fri, 14 Jan 2022 14:05:59 -0800
Subject: mm: add a field to store names for private anonymous memory

In many userspace applications, and especially in VM based applications
like Android uses heavily, there are multiple different allocators in
use.  At a minimum there is libc malloc and the stack, and in many cases
there are libc malloc, the stack, direct syscalls to mmap anonymous
memory, and multiple VM heaps (one for small objects, one for big
objects, etc.).  Each of these layers usually has its own tools to
inspect its usage; malloc by compiling a debug version, the VM through
heap inspection tools, and for direct syscalls there is usually no way
to track them.

On Android we heavily use a set of tools that use an extended version of
the logic covered in Documentation/vm/pagemap.txt to walk all pages
mapped in userspace and slice their usage by process, shared (COW) vs.
unique mappings, backing, etc.  This can account for real physical
memory usage even in cases like fork without exec (which Android uses
heavily to share as many private COW pages as possible between
processes), Kernel SamePage Merging, and clean zero pages.  It produces
a measurement of the pages that only exist in that process (USS, for
unique), and a measurement of the physical memory usage of that process
with the cost of shared pages being evenly split between processes that
share them (PSS).

If all anonymous memory is indistinguishable then figuring out the real
physical memory usage (PSS) of each heap requires either a pagemap
walking tool that can understand the heap debugging of every layer, or
for every layer's heap debugging tools to implement the pagemap walking
logic, in which case it is hard to get a consistent view of memory
across the whole system.

Tracking the information in userspace leads to all sorts of problems.
It either needs to be stored inside the process, which means every
process has to have an API to export its current heap information upon
request, or it has to be stored externally in a filesystem that somebody
needs to clean up on crashes.  It needs to be readable while the process
is still running, so it has to have some sort of synchronization with
every layer of userspace.  Efficiently tracking the ranges requires
reimplementing something like the kernel vma trees, and linking to it
from every layer of userspace.  It requires more memory, more syscalls,
more runtime cost, and more complexity to separately track regions that
the kernel is already tracking.

This patch adds a field to /proc/pid/maps and /proc/pid/smaps to show a
userspace-provided name for anonymous vmas.  The names of named
anonymous vmas are shown in /proc/pid/maps and /proc/pid/smaps as
[anon:<name>].

Userspace can set the name for a region of memory by calling

   prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name)

Setting the name to NULL clears it.  The name length limit is 80 bytes
including NUL-terminator and is checked to contain only printable ascii
characters (including space), except '[',']','\','$' and '`'.

Ascii strings are being used to have a descriptive identifiers for vmas,
which can be understood by the users reading /proc/pid/maps or
/proc/pid/smaps.  Names can be standardized for a given system and they
can include some variable parts such as the name of the allocator or a
library, tid of the thread using it, etc.

The name is stored in a pointer in the shared union in vm_area_struct
that points to a null terminated string.  Anonymous vmas with the same
name (equivalent strings) and are otherwise mergeable will be merged.
The name pointers are not shared between vmas even if they contain the
same name.  The name pointer is stored in a union with fields that are
only used on file-backed mappings, so it does not increase memory usage.

CONFIG_ANON_VMA_NAME kernel configuration is introduced to enable this
feature.  It keeps the feature disabled by default to prevent any
additional memory overhead and to avoid confusing procfs parsers on
systems which are not ready to support named anonymous vmas.

The patch is based on the original patch developed by Colin Cross, more
specifically on its latest version [1] posted upstream by Sumit Semwal.
It used a userspace pointer to store vma names.  In that design, name
pointers could be shared between vmas.  However during the last
upstreaming attempt, Kees Cook raised concerns [2] about this approach
and suggested to copy the name into kernel memory space, perform
validity checks [3] and store as a string referenced from
vm_area_struct.

One big concern is about fork() performance which would need to strdup
anonymous vma names.  Dave Hansen suggested experimenting with
worst-case scenario of forking a process with 64k vmas having longest
possible names [4].  I ran this experiment on an ARM64 Android device
and recorded a worst-case regression of almost 40% when forking such a
process.

This regression is addressed in the followup patch which replaces the
pointer to a name with a refcounted structure that allows sharing the
name pointer between vmas of the same name.  Instead of duplicating the
string during fork() or when splitting a vma it increments the refcount.

[1] https://lore.kernel.org/linux-mm/20200901161459.11772-4-sumit.semwal@linaro.org/
[2] https://lore.kernel.org/linux-mm/202009031031.D32EF57ED@keescook/
[3] https://lore.kernel.org/linux-mm/202009031022.3834F692@keescook/
[4] https://lore.kernel.org/linux-mm/5d0358ab-8c47-2f5f-8e43-23b89d6a8e95@intel.com/

Changes for prctl(2) manual page (in the options section):

PR_SET_VMA
	Sets an attribute specified in arg2 for virtual memory areas
	starting from the address specified in arg3 and spanning the
	size specified	in arg4. arg5 specifies the value of the attribute
	to be set. Note that assigning an attribute to a virtual memory
	area might prevent it from being merged with adjacent virtual
	memory areas due to the difference in that attribute's value.

	Currently, arg2 must be one of:

	PR_SET_VMA_ANON_NAME
		Set a name for anonymous virtual memory areas. arg5 should
		be a pointer to a null-terminated string containing the
		name. The name length including null byte cannot exceed
		80 bytes. If arg5 is NULL, the name of the appropriate
		anonymous virtual memory areas will be reset. The name
		can contain only printable ascii characters (including
                space), except '[',']','\','$' and '`'.

                This feature is available only if the kernel is built with
                the CONFIG_ANON_VMA_NAME option enabled.

[surenb@google.com: docs: proc.rst: /proc/PID/maps: fix malformed table]
  Link: https://lkml.kernel.org/r/20211123185928.2513763-1-surenb@google.com
[surenb: rebased over v5.15-rc6, replaced userpointer with a kernel copy,
 added input sanitization and CONFIG_ANON_VMA_NAME config. The bulk of the
 work here was done by Colin Cross, therefore, with his permission, keeping
 him as the author]

Link: https://lkml.kernel.org/r/20211019215511.3771969-2-surenb@google.com
Signed-off-by: Colin Cross <ccross@google.com>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jan Glauber <jan.glauber@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rob Landley <rob@landley.net>
Cc: "Serge E. Hallyn" <serge.hallyn@ubuntu.com>
Cc: Shaohua Li <shli@fusionio.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h       | 13 +++++++++-
 include/linux/mm_types.h | 64 +++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 72 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index a7e4a9e7d807..7000442984b9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2658,7 +2658,7 @@ static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start,
 extern struct vm_area_struct *vma_merge(struct mm_struct *,
 	struct vm_area_struct *prev, unsigned long addr, unsigned long end,
 	unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
-	struct mempolicy *, struct vm_userfaultfd_ctx);
+	struct mempolicy *, struct vm_userfaultfd_ctx, const char *);
 extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
 extern int __split_vma(struct mm_struct *, struct vm_area_struct *,
 	unsigned long addr, int new_below);
@@ -3391,5 +3391,16 @@ static inline int seal_check_future_write(int seals, struct vm_area_struct *vma)
 	return 0;
 }
 
+#ifdef CONFIG_ANON_VMA_NAME
+int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
+			  unsigned long len_in, const char *name);
+#else
+static inline int
+madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
+		      unsigned long len_in, const char *name) {
+	return 0;
+}
+#endif
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index c3a6e6209600..799e2ee626b2 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -426,11 +426,19 @@ struct vm_area_struct {
 	/*
 	 * For areas with an address space and backing store,
 	 * linkage into the address_space->i_mmap interval tree.
+	 *
+	 * For private anonymous mappings, a pointer to a null terminated string
+	 * containing the name given to the vma, or NULL if unnamed.
 	 */
-	struct {
-		struct rb_node rb;
-		unsigned long rb_subtree_last;
-	} shared;
+
+	union {
+		struct {
+			struct rb_node rb;
+			unsigned long rb_subtree_last;
+		} shared;
+		/* Serialized by mmap_sem. */
+		char *anon_name;
+	};
 
 	/*
 	 * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
@@ -875,4 +883,52 @@ typedef struct {
 	unsigned long val;
 } swp_entry_t;
 
+#ifdef CONFIG_ANON_VMA_NAME
+/*
+ * mmap_lock should be read-locked when calling vma_anon_name() and while using
+ * the returned pointer.
+ */
+extern const char *vma_anon_name(struct vm_area_struct *vma);
+
+/*
+ * mmap_lock should be read-locked for orig_vma->vm_mm.
+ * mmap_lock should be write-locked for new_vma->vm_mm or new_vma should be
+ * isolated.
+ */
+extern void dup_vma_anon_name(struct vm_area_struct *orig_vma,
+			      struct vm_area_struct *new_vma);
+
+/*
+ * mmap_lock should be write-locked or vma should have been isolated under
+ * write-locked mmap_lock protection.
+ */
+extern void free_vma_anon_name(struct vm_area_struct *vma);
+
+/* mmap_lock should be read-locked */
+static inline bool is_same_vma_anon_name(struct vm_area_struct *vma,
+					 const char *name)
+{
+	const char *vma_name = vma_anon_name(vma);
+
+	/* either both NULL, or pointers to same string */
+	if (vma_name == name)
+		return true;
+
+	return name && vma_name && !strcmp(name, vma_name);
+}
+#else /* CONFIG_ANON_VMA_NAME */
+static inline const char *vma_anon_name(struct vm_area_struct *vma)
+{
+	return NULL;
+}
+static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma,
+			      struct vm_area_struct *new_vma) {}
+static inline void free_vma_anon_name(struct vm_area_struct *vma) {}
+static inline bool is_same_vma_anon_name(struct vm_area_struct *vma,
+					 const char *name)
+{
+	return true;
+}
+#endif  /* CONFIG_ANON_VMA_NAME */
+
 #endif /* _LINUX_MM_TYPES_H */
-- 
cgit v1.2.3


From 78db3412833dc9c479cd17412035f216cfd01a29 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Fri, 14 Jan 2022 14:06:03 -0800
Subject: mm: add anonymous vma name refcounting

While forking a process with high number (64K) of named anonymous vmas
the overhead caused by strdup() is noticeable.  Experiments with ARM64
Android device show up to 40% performance regression when forking a
process with 64k unpopulated anonymous vmas using the max name lengths
vs the same process with the same number of anonymous vmas having no
name.

Introduce anon_vma_name refcounted structure to avoid the overhead of
copying vma names during fork() and when splitting named anonymous vmas.

When a vma is duplicated, instead of copying the name we increment the
refcount of this structure.  Multiple vmas can point to the same
anon_vma_name as long as they increment the refcount.  The name member
of anon_vma_name structure is assigned at structure allocation time and
is never changed.  If vma name changes then the refcount of the original
structure is dropped, a new anon_vma_name structure is allocated to hold
the new name and the vma pointer is updated to point to the new
structure.

With this approach the fork() performance regressions is reduced 3-4x
times and with usecases using more reasonable number of VMAs (a few
thousand) the regressions is not measurable.

Link: https://lkml.kernel.org/r/20211019215511.3771969-3-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Colin Cross <ccross@google.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jan Glauber <jan.glauber@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rob Landley <rob@landley.net>
Cc: "Serge E. Hallyn" <serge.hallyn@ubuntu.com>
Cc: Shaohua Li <shli@fusionio.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_types.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 799e2ee626b2..449b6eafc695 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -5,6 +5,7 @@
 #include <linux/mm_types_task.h>
 
 #include <linux/auxvec.h>
+#include <linux/kref.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/rbtree.h>
@@ -386,6 +387,12 @@ struct vm_userfaultfd_ctx {
 struct vm_userfaultfd_ctx {};
 #endif /* CONFIG_USERFAULTFD */
 
+struct anon_vma_name {
+	struct kref kref;
+	/* The name needs to be at the end because it is dynamically sized. */
+	char name[];
+};
+
 /*
  * This struct describes a virtual memory area. There is one of these
  * per VM-area/task. A VM area is any part of the process virtual memory
@@ -437,7 +444,7 @@ struct vm_area_struct {
 			unsigned long rb_subtree_last;
 		} shared;
 		/* Serialized by mmap_sem. */
-		char *anon_name;
+		struct anon_vma_name *anon_name;
 	};
 
 	/*
-- 
cgit v1.2.3


From 17fca131cee21724ee953a17c185c14e9533af5b Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 14 Jan 2022 14:06:07 -0800
Subject: mm: move anon_vma declarations to linux/mm_inline.h

The patch to add anonymous vma names causes a build failure in some
configurations:

  include/linux/mm_types.h: In function 'is_same_vma_anon_name':
  include/linux/mm_types.h:924:37: error: implicit declaration of function 'strcmp' [-Werror=implicit-function-declaration]
    924 |         return name && vma_name && !strcmp(name, vma_name);
        |                                     ^~~~~~
  include/linux/mm_types.h:22:1: note: 'strcmp' is defined in header '<string.h>'; did you forget to '#include <string.h>'?

This should not really be part of linux/mm_types.h in the first place,
as that header is meant to only contain structure defintions and need a
minimum set of indirect includes itself.

While the header clearly includes more than it should at this point,
let's not make it worse by including string.h as well, which would pull
in the expensive (compile-speed wise) fortify-string logic.

Move the new functions into a separate header that only needs to be
included in a couple of locations.

Link: https://lkml.kernel.org/r/20211207125710.2503446-1-arnd@kernel.org
Fixes: "mm: add a field to store names for private anonymous memory"
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Colin Cross <ccross@google.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_inline.h | 50 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mm_types.h  | 48 ---------------------------------------------
 2 files changed, 50 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index e2ec68b0515c..47d96d2647ca 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -4,6 +4,7 @@
 
 #include <linux/huge_mm.h>
 #include <linux/swap.h>
+#include <linux/string.h>
 
 /**
  * folio_is_file_lru - Should the folio be on a file LRU or anon LRU?
@@ -135,4 +136,53 @@ static __always_inline void del_page_from_lru_list(struct page *page,
 {
 	lruvec_del_folio(lruvec, page_folio(page));
 }
+
+#ifdef CONFIG_ANON_VMA_NAME
+/*
+ * mmap_lock should be read-locked when calling vma_anon_name() and while using
+ * the returned pointer.
+ */
+extern const char *vma_anon_name(struct vm_area_struct *vma);
+
+/*
+ * mmap_lock should be read-locked for orig_vma->vm_mm.
+ * mmap_lock should be write-locked for new_vma->vm_mm or new_vma should be
+ * isolated.
+ */
+extern void dup_vma_anon_name(struct vm_area_struct *orig_vma,
+			      struct vm_area_struct *new_vma);
+
+/*
+ * mmap_lock should be write-locked or vma should have been isolated under
+ * write-locked mmap_lock protection.
+ */
+extern void free_vma_anon_name(struct vm_area_struct *vma);
+
+/* mmap_lock should be read-locked */
+static inline bool is_same_vma_anon_name(struct vm_area_struct *vma,
+					 const char *name)
+{
+	const char *vma_name = vma_anon_name(vma);
+
+	/* either both NULL, or pointers to same string */
+	if (vma_name == name)
+		return true;
+
+	return name && vma_name && !strcmp(name, vma_name);
+}
+#else /* CONFIG_ANON_VMA_NAME */
+static inline const char *vma_anon_name(struct vm_area_struct *vma)
+{
+	return NULL;
+}
+static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma,
+			      struct vm_area_struct *new_vma) {}
+static inline void free_vma_anon_name(struct vm_area_struct *vma) {}
+static inline bool is_same_vma_anon_name(struct vm_area_struct *vma,
+					 const char *name)
+{
+	return true;
+}
+#endif  /* CONFIG_ANON_VMA_NAME */
+
 #endif
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 449b6eafc695..4d5fb84eed5e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -890,52 +890,4 @@ typedef struct {
 	unsigned long val;
 } swp_entry_t;
 
-#ifdef CONFIG_ANON_VMA_NAME
-/*
- * mmap_lock should be read-locked when calling vma_anon_name() and while using
- * the returned pointer.
- */
-extern const char *vma_anon_name(struct vm_area_struct *vma);
-
-/*
- * mmap_lock should be read-locked for orig_vma->vm_mm.
- * mmap_lock should be write-locked for new_vma->vm_mm or new_vma should be
- * isolated.
- */
-extern void dup_vma_anon_name(struct vm_area_struct *orig_vma,
-			      struct vm_area_struct *new_vma);
-
-/*
- * mmap_lock should be write-locked or vma should have been isolated under
- * write-locked mmap_lock protection.
- */
-extern void free_vma_anon_name(struct vm_area_struct *vma);
-
-/* mmap_lock should be read-locked */
-static inline bool is_same_vma_anon_name(struct vm_area_struct *vma,
-					 const char *name)
-{
-	const char *vma_name = vma_anon_name(vma);
-
-	/* either both NULL, or pointers to same string */
-	if (vma_name == name)
-		return true;
-
-	return name && vma_name && !strcmp(name, vma_name);
-}
-#else /* CONFIG_ANON_VMA_NAME */
-static inline const char *vma_anon_name(struct vm_area_struct *vma)
-{
-	return NULL;
-}
-static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma,
-			      struct vm_area_struct *new_vma) {}
-static inline void free_vma_anon_name(struct vm_area_struct *vma) {}
-static inline bool is_same_vma_anon_name(struct vm_area_struct *vma,
-					 const char *name)
-{
-	return true;
-}
-#endif  /* CONFIG_ANON_VMA_NAME */
-
 #endif /* _LINUX_MM_TYPES_H */
-- 
cgit v1.2.3


From 36090def7bad06a6346f86a7cfdbfda2d138cb64 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 14 Jan 2022 14:06:10 -0800
Subject: mm: move tlb_flush_pending inline helpers to mm_inline.h

linux/mm_types.h should only define structure definitions, to make it
cheap to include elsewhere.  The atomic_t helper function definitions
are particularly large, so it's better to move the helpers using those
into the existing linux/mm_inline.h and only include that where needed.

As a follow-up, we may want to go through all the indirect includes in
mm_types.h and reduce them as much as possible.

Link: https://lkml.kernel.org/r/20211207125710.2503446-2-arnd@kernel.org
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Colin Cross <ccross@google.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h        |  45 ----------------
 include/linux/mm_inline.h |  86 +++++++++++++++++++++++++++++++
 include/linux/mm_types.h  | 129 ++++++++++++++++------------------------------
 3 files changed, 131 insertions(+), 129 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7000442984b9..c17e5cfc1e47 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -424,51 +424,6 @@ extern unsigned int kobjsize(const void *objp);
  */
 extern pgprot_t protection_map[16];
 
-/**
- * enum fault_flag - Fault flag definitions.
- * @FAULT_FLAG_WRITE: Fault was a write fault.
- * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE.
- * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked.
- * @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_lock and wait when retrying.
- * @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region.
- * @FAULT_FLAG_TRIED: The fault has been tried once.
- * @FAULT_FLAG_USER: The fault originated in userspace.
- * @FAULT_FLAG_REMOTE: The fault is not for current task/mm.
- * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch.
- * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals.
- *
- * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify
- * whether we would allow page faults to retry by specifying these two
- * fault flags correctly.  Currently there can be three legal combinations:
- *
- * (a) ALLOW_RETRY and !TRIED:  this means the page fault allows retry, and
- *                              this is the first try
- *
- * (b) ALLOW_RETRY and TRIED:   this means the page fault allows retry, and
- *                              we've already tried at least once
- *
- * (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry
- *
- * The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never
- * be used.  Note that page faults can be allowed to retry for multiple times,
- * in which case we'll have an initial fault with flags (a) then later on
- * continuous faults with flags (b).  We should always try to detect pending
- * signals before a retry to make sure the continuous page faults can still be
- * interrupted if necessary.
- */
-enum fault_flag {
-	FAULT_FLAG_WRITE =		1 << 0,
-	FAULT_FLAG_MKWRITE =		1 << 1,
-	FAULT_FLAG_ALLOW_RETRY =	1 << 2,
-	FAULT_FLAG_RETRY_NOWAIT = 	1 << 3,
-	FAULT_FLAG_KILLABLE =		1 << 4,
-	FAULT_FLAG_TRIED = 		1 << 5,
-	FAULT_FLAG_USER =		1 << 6,
-	FAULT_FLAG_REMOTE =		1 << 7,
-	FAULT_FLAG_INSTRUCTION =	1 << 8,
-	FAULT_FLAG_INTERRUPTIBLE =	1 << 9,
-};
-
 /*
  * The default fault flags that should be used by most of the
  * arch-specific page fault handlers.
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 47d96d2647ca..b725839dfe71 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -2,6 +2,7 @@
 #ifndef LINUX_MM_INLINE_H
 #define LINUX_MM_INLINE_H
 
+#include <linux/atomic.h>
 #include <linux/huge_mm.h>
 #include <linux/swap.h>
 #include <linux/string.h>
@@ -185,4 +186,89 @@ static inline bool is_same_vma_anon_name(struct vm_area_struct *vma,
 }
 #endif  /* CONFIG_ANON_VMA_NAME */
 
+static inline void init_tlb_flush_pending(struct mm_struct *mm)
+{
+	atomic_set(&mm->tlb_flush_pending, 0);
+}
+
+static inline void inc_tlb_flush_pending(struct mm_struct *mm)
+{
+	atomic_inc(&mm->tlb_flush_pending);
+	/*
+	 * The only time this value is relevant is when there are indeed pages
+	 * to flush. And we'll only flush pages after changing them, which
+	 * requires the PTL.
+	 *
+	 * So the ordering here is:
+	 *
+	 *	atomic_inc(&mm->tlb_flush_pending);
+	 *	spin_lock(&ptl);
+	 *	...
+	 *	set_pte_at();
+	 *	spin_unlock(&ptl);
+	 *
+	 *				spin_lock(&ptl)
+	 *				mm_tlb_flush_pending();
+	 *				....
+	 *				spin_unlock(&ptl);
+	 *
+	 *	flush_tlb_range();
+	 *	atomic_dec(&mm->tlb_flush_pending);
+	 *
+	 * Where the increment if constrained by the PTL unlock, it thus
+	 * ensures that the increment is visible if the PTE modification is
+	 * visible. After all, if there is no PTE modification, nobody cares
+	 * about TLB flushes either.
+	 *
+	 * This very much relies on users (mm_tlb_flush_pending() and
+	 * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and
+	 * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc
+	 * locks (PPC) the unlock of one doesn't order against the lock of
+	 * another PTL.
+	 *
+	 * The decrement is ordered by the flush_tlb_range(), such that
+	 * mm_tlb_flush_pending() will not return false unless all flushes have
+	 * completed.
+	 */
+}
+
+static inline void dec_tlb_flush_pending(struct mm_struct *mm)
+{
+	/*
+	 * See inc_tlb_flush_pending().
+	 *
+	 * This cannot be smp_mb__before_atomic() because smp_mb() simply does
+	 * not order against TLB invalidate completion, which is what we need.
+	 *
+	 * Therefore we must rely on tlb_flush_*() to guarantee order.
+	 */
+	atomic_dec(&mm->tlb_flush_pending);
+}
+
+static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
+{
+	/*
+	 * Must be called after having acquired the PTL; orders against that
+	 * PTLs release and therefore ensures that if we observe the modified
+	 * PTE we must also observe the increment from inc_tlb_flush_pending().
+	 *
+	 * That is, it only guarantees to return true if there is a flush
+	 * pending for _this_ PTL.
+	 */
+	return atomic_read(&mm->tlb_flush_pending);
+}
+
+static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
+{
+	/*
+	 * Similar to mm_tlb_flush_pending(), we must have acquired the PTL
+	 * for which there is a TLB flush pending in order to guarantee
+	 * we've seen both that PTE modification and the increment.
+	 *
+	 * (no requirement on actually still holding the PTL, that is irrelevant)
+	 */
+	return atomic_read(&mm->tlb_flush_pending) > 1;
+}
+
+
 #endif
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 4d5fb84eed5e..6a89f128c990 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -692,90 +692,6 @@ extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
 extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
 extern void tlb_finish_mmu(struct mmu_gather *tlb);
 
-static inline void init_tlb_flush_pending(struct mm_struct *mm)
-{
-	atomic_set(&mm->tlb_flush_pending, 0);
-}
-
-static inline void inc_tlb_flush_pending(struct mm_struct *mm)
-{
-	atomic_inc(&mm->tlb_flush_pending);
-	/*
-	 * The only time this value is relevant is when there are indeed pages
-	 * to flush. And we'll only flush pages after changing them, which
-	 * requires the PTL.
-	 *
-	 * So the ordering here is:
-	 *
-	 *	atomic_inc(&mm->tlb_flush_pending);
-	 *	spin_lock(&ptl);
-	 *	...
-	 *	set_pte_at();
-	 *	spin_unlock(&ptl);
-	 *
-	 *				spin_lock(&ptl)
-	 *				mm_tlb_flush_pending();
-	 *				....
-	 *				spin_unlock(&ptl);
-	 *
-	 *	flush_tlb_range();
-	 *	atomic_dec(&mm->tlb_flush_pending);
-	 *
-	 * Where the increment if constrained by the PTL unlock, it thus
-	 * ensures that the increment is visible if the PTE modification is
-	 * visible. After all, if there is no PTE modification, nobody cares
-	 * about TLB flushes either.
-	 *
-	 * This very much relies on users (mm_tlb_flush_pending() and
-	 * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and
-	 * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc
-	 * locks (PPC) the unlock of one doesn't order against the lock of
-	 * another PTL.
-	 *
-	 * The decrement is ordered by the flush_tlb_range(), such that
-	 * mm_tlb_flush_pending() will not return false unless all flushes have
-	 * completed.
-	 */
-}
-
-static inline void dec_tlb_flush_pending(struct mm_struct *mm)
-{
-	/*
-	 * See inc_tlb_flush_pending().
-	 *
-	 * This cannot be smp_mb__before_atomic() because smp_mb() simply does
-	 * not order against TLB invalidate completion, which is what we need.
-	 *
-	 * Therefore we must rely on tlb_flush_*() to guarantee order.
-	 */
-	atomic_dec(&mm->tlb_flush_pending);
-}
-
-static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
-{
-	/*
-	 * Must be called after having acquired the PTL; orders against that
-	 * PTLs release and therefore ensures that if we observe the modified
-	 * PTE we must also observe the increment from inc_tlb_flush_pending().
-	 *
-	 * That is, it only guarantees to return true if there is a flush
-	 * pending for _this_ PTL.
-	 */
-	return atomic_read(&mm->tlb_flush_pending);
-}
-
-static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
-{
-	/*
-	 * Similar to mm_tlb_flush_pending(), we must have acquired the PTL
-	 * for which there is a TLB flush pending in order to guarantee
-	 * we've seen both that PTE modification and the increment.
-	 *
-	 * (no requirement on actually still holding the PTL, that is irrelevant)
-	 */
-	return atomic_read(&mm->tlb_flush_pending) > 1;
-}
-
 struct vm_fault;
 
 /**
@@ -890,4 +806,49 @@ typedef struct {
 	unsigned long val;
 } swp_entry_t;
 
+/**
+ * enum fault_flag - Fault flag definitions.
+ * @FAULT_FLAG_WRITE: Fault was a write fault.
+ * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE.
+ * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked.
+ * @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_lock and wait when retrying.
+ * @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region.
+ * @FAULT_FLAG_TRIED: The fault has been tried once.
+ * @FAULT_FLAG_USER: The fault originated in userspace.
+ * @FAULT_FLAG_REMOTE: The fault is not for current task/mm.
+ * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch.
+ * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals.
+ *
+ * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify
+ * whether we would allow page faults to retry by specifying these two
+ * fault flags correctly.  Currently there can be three legal combinations:
+ *
+ * (a) ALLOW_RETRY and !TRIED:  this means the page fault allows retry, and
+ *                              this is the first try
+ *
+ * (b) ALLOW_RETRY and TRIED:   this means the page fault allows retry, and
+ *                              we've already tried at least once
+ *
+ * (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry
+ *
+ * The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never
+ * be used.  Note that page faults can be allowed to retry for multiple times,
+ * in which case we'll have an initial fault with flags (a) then later on
+ * continuous faults with flags (b).  We should always try to detect pending
+ * signals before a retry to make sure the continuous page faults can still be
+ * interrupted if necessary.
+ */
+enum fault_flag {
+	FAULT_FLAG_WRITE =		1 << 0,
+	FAULT_FLAG_MKWRITE =		1 << 1,
+	FAULT_FLAG_ALLOW_RETRY =	1 << 2,
+	FAULT_FLAG_RETRY_NOWAIT = 	1 << 3,
+	FAULT_FLAG_KILLABLE =		1 << 4,
+	FAULT_FLAG_TRIED = 		1 << 5,
+	FAULT_FLAG_USER =		1 << 6,
+	FAULT_FLAG_REMOTE =		1 << 7,
+	FAULT_FLAG_INSTRUCTION =	1 << 8,
+	FAULT_FLAG_INTERRUPTIBLE =	1 << 9,
+};
+
 #endif /* _LINUX_MM_TYPES_H */
-- 
cgit v1.2.3


From cc6dcfee72509868271d42919a3c1081b6b0dc7e Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Fri, 14 Jan 2022 14:06:18 -0800
Subject: mm: document locking restrictions for vm_operations_struct::close

Add comments for vm_operations_struct::close documenting locking
requirements for this callback and its callers.

Link: https://lkml.kernel.org/r/20211209191325.3069345-2-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Christian Brauner <christian@brauner.io>
Cc: Christian Brauner <christian.brauner@ubuntu.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Florian Weimer <fweimer@redhat.com>
Cc: Jan Engelhardt <jengelh@inai.de>
Cc: Jann Horn <jannh@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Tim Murray <timmurray@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c17e5cfc1e47..4d7245e6802a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -532,6 +532,10 @@ enum page_entry_size {
  */
 struct vm_operations_struct {
 	void (*open)(struct vm_area_struct * area);
+	/**
+	 * @close: Called when the VMA is being removed from the MM.
+	 * Context: User context.  May sleep.  Caller holds mmap_lock.
+	 */
 	void (*close)(struct vm_area_struct * area);
 	/* Called any time before splitting to check if it's allowed */
 	int (*may_split)(struct vm_area_struct *area, unsigned long addr);
-- 
cgit v1.2.3


From 08d5b29eac7dd5e6c79b66d390ecbb9219e05931 Mon Sep 17 00:00:00 2001
From: Pasha Tatashin <pasha.tatashin@soleen.com>
Date: Fri, 14 Jan 2022 14:06:33 -0800
Subject: mm: ptep_clear() page table helper

We have ptep_get_and_clear() and ptep_get_and_clear_full() helpers to
clear PTE from user page tables, but there is no variant for simple
clear of a present PTE from user page tables without using a low level
pte_clear() which can be either native or para-virtualised.

Add a new ptep_clear() that can be used in common code to clear PTEs
from page table.  We will need this call later in order to add a hook
for page table check.

Link: https://lkml.kernel.org/r/20211221154650.1047963-3-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Greg Thelen <gthelen@google.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Slaby <jirislaby@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kees Cook <keescook@chromium.org>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sami Tolvanen <samitolvanen@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wei Xu <weixugc@google.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pgtable.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index e24d2c992b11..bc8713a76e03 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -258,6 +258,14 @@ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma,
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif
 
+#ifndef __HAVE_ARCH_PTEP_CLEAR
+static inline void ptep_clear(struct mm_struct *mm, unsigned long addr,
+			      pte_t *ptep)
+{
+	pte_clear(mm, addr, ptep);
+}
+#endif
+
 #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
 static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
 				       unsigned long address,
-- 
cgit v1.2.3


From df4e817b710809425d899340dbfa8504a3ca4ba5 Mon Sep 17 00:00:00 2001
From: Pasha Tatashin <pasha.tatashin@soleen.com>
Date: Fri, 14 Jan 2022 14:06:37 -0800
Subject: mm: page table check

Check user page table entries at the time they are added and removed.

Allows to synchronously catch memory corruption issues related to double
mapping.

When a pte for an anonymous page is added into page table, we verify
that this pte does not already point to a file backed page, and vice
versa if this is a file backed page that is being added we verify that
this page does not have an anonymous mapping

We also enforce that read-only sharing for anonymous pages is allowed
(i.e.  cow after fork).  All other sharing must be for file pages.

Page table check allows to protect and debug cases where "struct page"
metadata became corrupted for some reason.  For example, when refcnt or
mapcount become invalid.

Link: https://lkml.kernel.org/r/20211221154650.1047963-4-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Greg Thelen <gthelen@google.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Slaby <jirislaby@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kees Cook <keescook@chromium.org>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sami Tolvanen <samitolvanen@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wei Xu <weixugc@google.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page_table_check.h | 147 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 147 insertions(+)
 create mode 100644 include/linux/page_table_check.h

(limited to 'include/linux')

diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h
new file mode 100644
index 000000000000..38cace1da7b6
--- /dev/null
+++ b/include/linux/page_table_check.h
@@ -0,0 +1,147 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Copyright (c) 2021, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+#ifndef __LINUX_PAGE_TABLE_CHECK_H
+#define __LINUX_PAGE_TABLE_CHECK_H
+
+#ifdef CONFIG_PAGE_TABLE_CHECK
+#include <linux/jump_label.h>
+
+extern struct static_key_true page_table_check_disabled;
+extern struct page_ext_operations page_table_check_ops;
+
+void __page_table_check_zero(struct page *page, unsigned int order);
+void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr,
+				  pte_t pte);
+void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr,
+				  pmd_t pmd);
+void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr,
+				  pud_t pud);
+void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr,
+				pte_t *ptep, pte_t pte);
+void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr,
+				pmd_t *pmdp, pmd_t pmd);
+void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr,
+				pud_t *pudp, pud_t pud);
+
+static inline void page_table_check_alloc(struct page *page, unsigned int order)
+{
+	if (static_branch_likely(&page_table_check_disabled))
+		return;
+
+	__page_table_check_zero(page, order);
+}
+
+static inline void page_table_check_free(struct page *page, unsigned int order)
+{
+	if (static_branch_likely(&page_table_check_disabled))
+		return;
+
+	__page_table_check_zero(page, order);
+}
+
+static inline void page_table_check_pte_clear(struct mm_struct *mm,
+					      unsigned long addr, pte_t pte)
+{
+	if (static_branch_likely(&page_table_check_disabled))
+		return;
+
+	__page_table_check_pte_clear(mm, addr, pte);
+}
+
+static inline void page_table_check_pmd_clear(struct mm_struct *mm,
+					      unsigned long addr, pmd_t pmd)
+{
+	if (static_branch_likely(&page_table_check_disabled))
+		return;
+
+	__page_table_check_pmd_clear(mm, addr, pmd);
+}
+
+static inline void page_table_check_pud_clear(struct mm_struct *mm,
+					      unsigned long addr, pud_t pud)
+{
+	if (static_branch_likely(&page_table_check_disabled))
+		return;
+
+	__page_table_check_pud_clear(mm, addr, pud);
+}
+
+static inline void page_table_check_pte_set(struct mm_struct *mm,
+					    unsigned long addr, pte_t *ptep,
+					    pte_t pte)
+{
+	if (static_branch_likely(&page_table_check_disabled))
+		return;
+
+	__page_table_check_pte_set(mm, addr, ptep, pte);
+}
+
+static inline void page_table_check_pmd_set(struct mm_struct *mm,
+					    unsigned long addr, pmd_t *pmdp,
+					    pmd_t pmd)
+{
+	if (static_branch_likely(&page_table_check_disabled))
+		return;
+
+	__page_table_check_pmd_set(mm, addr, pmdp, pmd);
+}
+
+static inline void page_table_check_pud_set(struct mm_struct *mm,
+					    unsigned long addr, pud_t *pudp,
+					    pud_t pud)
+{
+	if (static_branch_likely(&page_table_check_disabled))
+		return;
+
+	__page_table_check_pud_set(mm, addr, pudp, pud);
+}
+
+#else
+
+static inline void page_table_check_alloc(struct page *page, unsigned int order)
+{
+}
+
+static inline void page_table_check_free(struct page *page, unsigned int order)
+{
+}
+
+static inline void page_table_check_pte_clear(struct mm_struct *mm,
+					      unsigned long addr, pte_t pte)
+{
+}
+
+static inline void page_table_check_pmd_clear(struct mm_struct *mm,
+					      unsigned long addr, pmd_t pmd)
+{
+}
+
+static inline void page_table_check_pud_clear(struct mm_struct *mm,
+					      unsigned long addr, pud_t pud)
+{
+}
+
+static inline void page_table_check_pte_set(struct mm_struct *mm,
+					    unsigned long addr, pte_t *ptep,
+					    pte_t pte)
+{
+}
+
+static inline void page_table_check_pmd_set(struct mm_struct *mm,
+					    unsigned long addr, pmd_t *pmdp,
+					    pmd_t pmd)
+{
+}
+
+static inline void page_table_check_pud_set(struct mm_struct *mm,
+					    unsigned long addr, pud_t *pudp,
+					    pud_t pud)
+{
+}
+
+#endif /* CONFIG_PAGE_TABLE_CHECK */
+#endif /* __LINUX_PAGE_TABLE_CHECK_H */
-- 
cgit v1.2.3


From 020e87650af9f43683546729f959fdc78422a4b7 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 14 Jan 2022 14:06:44 -0800
Subject: mm: remove last argument of reuse_swap_page()

None of the callers care about the total_map_swapcount() any more.

Link: https://lkml.kernel.org/r/20211220205943.456187-1-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index d1ea44b31f19..bdccbf1efa61 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -514,7 +514,7 @@ extern int __swp_swapcount(swp_entry_t entry);
 extern int swp_swapcount(swp_entry_t entry);
 extern struct swap_info_struct *page_swap_info(struct page *);
 extern struct swap_info_struct *swp_swap_info(swp_entry_t entry);
-extern bool reuse_swap_page(struct page *, int *);
+extern bool reuse_swap_page(struct page *);
 extern int try_to_free_swap(struct page *);
 struct backing_dev_info;
 extern int init_swap_address_space(unsigned int type, unsigned long nr_pages);
@@ -680,8 +680,8 @@ static inline int swp_swapcount(swp_entry_t entry)
 	return 0;
 }
 
-#define reuse_swap_page(page, total_map_swapcount) \
-	(page_trans_huge_mapcount(page, total_map_swapcount) == 1)
+#define reuse_swap_page(page) \
+	(page_trans_huge_mapcount(page, NULL) == 1)
 
 static inline int try_to_free_swap(struct page *page)
 {
-- 
cgit v1.2.3


From d08d2b62510e2407cf939e693aefd179dc114913 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 14 Jan 2022 14:06:51 -0800
Subject: mm: remove the total_mapcount argument from
 page_trans_huge_mapcount()

All callers pass NULL, so we can stop calculating the value we would
store in it.

Link: https://lkml.kernel.org/r/20211220205943.456187-3-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h   | 10 +++-------
 include/linux/swap.h |  2 +-
 2 files changed, 4 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4d7245e6802a..cef65f9cbdf2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -799,19 +799,15 @@ static inline int page_mapcount(struct page *page)
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 int total_mapcount(struct page *page);
-int page_trans_huge_mapcount(struct page *page, int *total_mapcount);
+int page_trans_huge_mapcount(struct page *page);
 #else
 static inline int total_mapcount(struct page *page)
 {
 	return page_mapcount(page);
 }
-static inline int page_trans_huge_mapcount(struct page *page,
-					   int *total_mapcount)
+static inline int page_trans_huge_mapcount(struct page *page)
 {
-	int mapcount = page_mapcount(page);
-	if (total_mapcount)
-		*total_mapcount = mapcount;
-	return mapcount;
+	return page_mapcount(page);
 }
 #endif
 
diff --git a/include/linux/swap.h b/include/linux/swap.h
index bdccbf1efa61..1d38d9475c4d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -681,7 +681,7 @@ static inline int swp_swapcount(swp_entry_t entry)
 }
 
 #define reuse_swap_page(page) \
-	(page_trans_huge_mapcount(page, NULL) == 1)
+	(page_trans_huge_mapcount(page) == 1)
 
 static inline int try_to_free_swap(struct page *page)
 {
-- 
cgit v1.2.3


From a421ef303008b0ceee2cfc625c3246fa7654b0ca Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 14 Jan 2022 14:07:07 -0800
Subject: mm: allow !GFP_KERNEL allocations for kvmalloc

Support for GFP_NO{FS,IO} and __GFP_NOFAIL has been implemented by
previous patches so we can allow the support for kvmalloc.  This will
allow some external users to simplify or completely remove their
helpers.

GFP_NOWAIT semantic hasn't been supported so far but it hasn't been
explicitly documented so let's add a note about that.

ceph_kvmalloc is the first helper to be dropped and changed to kvmalloc.

Link: https://lkml.kernel.org/r/20211122153233.9924-5-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Ilya Dryomov <idryomov@gmail.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Neil Brown <neilb@suse.de>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/ceph/libceph.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 409d8c29bc4f..309acbcb5a8a 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -295,7 +295,6 @@ extern bool libceph_compatible(void *data);
 
 extern const char *ceph_msg_type_name(int type);
 extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
-extern void *ceph_kvmalloc(size_t size, gfp_t flags);
 
 struct fs_parameter;
 struct fc_log;
-- 
cgit v1.2.3


From 4034247a0d6ab281ba3293798ce67af494d86129 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 14 Jan 2022 14:07:14 -0800
Subject: mm: introduce memalloc_retry_wait()

Various places in the kernel - largely in filesystems - respond to a
memory allocation failure by looping around and re-trying.  Some of
these cannot conveniently use __GFP_NOFAIL, for reasons such as:

 - a GFP_ATOMIC allocation, which __GFP_NOFAIL doesn't work on
 - a need to check for the process being signalled between failures
 - the possibility that other recovery actions could be performed
 - the allocation is quite deep in support code, and passing down an
   extra flag to say if __GFP_NOFAIL is wanted would be clumsy.

Many of these currently use congestion_wait() which (in almost all
cases) simply waits the given timeout - congestion isn't tracked for
most devices.

It isn't clear what the best delay is for loops, but it is clear that
the various filesystems shouldn't be responsible for choosing a timeout.

This patch introduces memalloc_retry_wait() with takes on that
responsibility.  Code that wants to retry a memory allocation can call
this function passing the GFP flags that were used.  It will wait
however is appropriate.

For now, it only considers __GFP_NORETRY and whatever
gfpflags_allow_blocking() tests.  If blocking is allowed without
__GFP_NORETRY, then alloc_page either made some reclaim progress, or
waited for a while, before failing.  So there is no need for much
further waiting.  memalloc_retry_wait() will wait until the current
jiffie ends.  If this condition is not met, then alloc_page() won't have
waited much if at all.  In that case memalloc_retry_wait() waits about
200ms.  This is the delay that most current loops uses.

linux/sched/mm.h needs to be included in some files now,
but linux/backing-dev.h does not.

Link: https://lkml.kernel.org/r/163754371968.13692.1277530886009912421@noble.neil.brown.name
Signed-off-by: NeilBrown <neilb@suse.de>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Chao Yu <chao@kernel.org>
Cc: Darrick J. Wong <djwong@kernel.org>
Cc: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched/mm.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index aca874d33fe6..aa5f09ca5bcf 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -214,6 +214,32 @@ static inline void fs_reclaim_acquire(gfp_t gfp_mask) { }
 static inline void fs_reclaim_release(gfp_t gfp_mask) { }
 #endif
 
+/* Any memory-allocation retry loop should use
+ * memalloc_retry_wait(), and pass the flags for the most
+ * constrained allocation attempt that might have failed.
+ * This provides useful documentation of where loops are,
+ * and a central place to fine tune the waiting as the MM
+ * implementation changes.
+ */
+static inline void memalloc_retry_wait(gfp_t gfp_flags)
+{
+	/* We use io_schedule_timeout because waiting for memory
+	 * typically included waiting for dirty pages to be
+	 * written out, which requires IO.
+	 */
+	__set_current_state(TASK_UNINTERRUPTIBLE);
+	gfp_flags = current_gfp_context(gfp_flags);
+	if (gfpflags_allow_blocking(gfp_flags) &&
+	    !(gfp_flags & __GFP_NORETRY))
+		/* Probably waited already, no need for much more */
+		io_schedule_timeout(1);
+	else
+		/* Probably didn't wait, and has now released a lock,
+		 * so now is a good time to wait
+		 */
+		io_schedule_timeout(HZ/50);
+}
+
 /**
  * might_alloc - Mark possible allocation sites
  * @gfp_mask: gfp_t flags that would be used to allocate
-- 
cgit v1.2.3


From 1611f74a94ba2e0f2d25b75008ed8e76e122097a Mon Sep 17 00:00:00 2001
From: Changcheng Deng <deng.changcheng@zte.com.cn>
Date: Fri, 14 Jan 2022 14:07:21 -0800
Subject: mm: fix boolreturn.cocci warning

Return statements in functions returning bool should use true/false
instead of 1/0.

Link: https://lkml.kernel.org/r/20211126073327.74815-1-deng.changcheng@zte.com.cn
Signed-off-by: Changcheng Deng <deng.changcheng@zte.com.cn>
Reported-by: Zeal Robot <zealci@zte.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page-flags.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index b5f14d581113..18423c2157e8 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -383,7 +383,7 @@ static __always_inline int TestClearPage##uname(struct page *page)	\
 	TESTCLEARFLAG(uname, lname, policy)
 
 #define TESTPAGEFLAG_FALSE(uname, lname)				\
-static inline bool folio_test_##lname(const struct folio *folio) { return 0; } \
+static inline bool folio_test_##lname(const struct folio *folio) { return false; } \
 static inline int Page##uname(const struct page *page) { return 0; }
 
 #define SETPAGEFLAG_NOOP(uname, lname)					\
-- 
cgit v1.2.3


From be1a13eb51077b2ec5f7f4306f93dfece503a3f1 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 14 Jan 2022 14:07:27 -0800
Subject: mm: drop node from alloc_pages_vma

alloc_pages_vma is meant to allocate a page with a vma specific memory
policy.  The initial node parameter is always a local node so it is
pointless to waste a function argument for this.  Drop the parameter.

Link: https://lkml.kernel.org/r/YaSnlv4QpryEpesG@dhcp22.suse.cz
Signed-off-by: Michal Hocko <mhocko@suse.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Ben Widawsky <ben.widawsky@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Feng Tang <feng.tang@intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 8fcc38467af6..78b58448f796 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -598,9 +598,9 @@ struct page *alloc_pages(gfp_t gfp, unsigned int order);
 struct folio *folio_alloc(gfp_t gfp, unsigned order);
 extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
 			struct vm_area_struct *vma, unsigned long addr,
-			int node, bool hugepage);
+			bool hugepage);
 #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
-	alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true)
+	alloc_pages_vma(gfp_mask, order, vma, addr, true)
 #else
 static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)
 {
@@ -610,14 +610,14 @@ static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order)
 {
 	return __folio_alloc_node(gfp, order, numa_node_id());
 }
-#define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\
+#define alloc_pages_vma(gfp_mask, order, vma, addr, false)\
 	alloc_pages(gfp_mask, order)
 #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
 	alloc_pages(gfp_mask, order)
 #endif
 #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
 #define alloc_page_vma(gfp_mask, vma, addr)			\
-	alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false)
+	alloc_pages_vma(gfp_mask, 0, vma, addr, false)
 
 extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
 extern unsigned long get_zeroed_page(gfp_t gfp_mask);
-- 
cgit v1.2.3


From 04a536bfbd0f885338eecc2a4503dfca50ac94dd Mon Sep 17 00:00:00 2001
From: Miles Chen <miles.chen@mediatek.com>
Date: Fri, 14 Jan 2022 14:07:30 -0800
Subject: include/linux/gfp.h: further document GFP_DMA32

kmalloc(..., GFP_DMA32) does not return DMA32 memory because the DMA32
kmalloc cache array is not implemented.  (Reason: there is no such user
in kernel).

Put a short comment about this so people can understand this by reading
the comment.

[1] https://lists.linuxfoundation.org/pipermail/iommu/2018-December/031696.html

Link: https://lkml.kernel.org/r/20211207093610.6406-1-miles.chen@mediatek.com
Signed-off-by: Miles Chen <miles.chen@mediatek.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 78b58448f796..80f63c862be5 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -302,7 +302,9 @@ struct vm_area_struct;
  * lowest zone as a type of emergency reserve.
  *
  * %GFP_DMA32 is similar to %GFP_DMA except that the caller requires a 32-bit
- * address.
+ * address. Note that kmalloc(..., GFP_DMA32) does not return DMA32 memory
+ * because the DMA32 kmalloc cache array is not implemented.
+ * (Reason: there is no such user in kernel).
  *
  * %GFP_HIGHUSER is for userspace allocations that may be mapped to userspace,
  * do not need to be directly accessible by the kernel but that cannot
-- 
cgit v1.2.3


From 62b3107073646e0946bd97ff926832bafb846d17 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Fri, 14 Jan 2022 14:07:37 -0800
Subject: mm_zone: add function to check if managed dma zone exists

Patch series "Handle warning of allocation failure on DMA zone w/o
managed pages", v4.

**Problem observed:
On x86_64, when crash is triggered and entering into kdump kernel, page
allocation failure can always be seen.

 ---------------------------------
 DMA: preallocated 128 KiB GFP_KERNEL pool for atomic allocations
 swapper/0: page allocation failure: order:5, mode:0xcc1(GFP_KERNEL|GFP_DMA), nodemask=(null),cpuset=/,mems_allowed=0
 CPU: 0 PID: 1 Comm: swapper/0
 Call Trace:
  dump_stack+0x7f/0xa1
  warn_alloc.cold+0x72/0xd6
  ......
  __alloc_pages+0x24d/0x2c0
  ......
  dma_atomic_pool_init+0xdb/0x176
  do_one_initcall+0x67/0x320
  ? rcu_read_lock_sched_held+0x3f/0x80
  kernel_init_freeable+0x290/0x2dc
  ? rest_init+0x24f/0x24f
  kernel_init+0xa/0x111
  ret_from_fork+0x22/0x30
 Mem-Info:
 ------------------------------------

***Root cause:
In the current kernel, it assumes that DMA zone must have managed pages
and try to request pages if CONFIG_ZONE_DMA is enabled. While this is not
always true. E.g in kdump kernel of x86_64, only low 1M is presented and
locked down at very early stage of boot, so that this low 1M won't be
added into buddy allocator to become managed pages of DMA zone. This
exception will always cause page allocation failure if page is requested
from DMA zone.

***Investigation:
This failure happens since below commit merged into linus's tree.
  1a6a9044b967 x86/setup: Remove CONFIG_X86_RESERVE_LOW and reservelow= options
  23721c8e92f7 x86/crash: Remove crash_reserve_low_1M()
  f1d4d47c5851 x86/setup: Always reserve the first 1M of RAM
  7c321eb2b843 x86/kdump: Remove the backup region handling
  6f599d84231f x86/kdump: Always reserve the low 1M when the crashkernel option is specified

Before them, on x86_64, the low 640K area will be reused by kdump kernel.
So in kdump kernel, the content of low 640K area is copied into a backup
region for dumping before jumping into kdump. Then except of those firmware
reserved region in [0, 640K], the left area will be added into buddy
allocator to become available managed pages of DMA zone.

However, after above commits applied, in kdump kernel of x86_64, the low
1M is reserved by memblock, but not released to buddy allocator. So any
later page allocation requested from DMA zone will fail.

At the beginning, if crashkernel is reserved, the low 1M need be locked
down because AMD SME encrypts memory making the old backup region
mechanims impossible when switching into kdump kernel.

Later, it was also observed that there are BIOSes corrupting memory
under 1M. To solve this, in commit f1d4d47c5851, the entire region of
low 1M is always reserved after the real mode trampoline is allocated.

Besides, recently, Intel engineer mentioned their TDX (Trusted domain
extensions) which is under development in kernel also needs to lock down
the low 1M. So we can't simply revert above commits to fix the page allocation
failure from DMA zone as someone suggested.

***Solution:
Currently, only DMA atomic pool and dma-kmalloc will initialize and
request page allocation with GFP_DMA during bootup.

So only initializ DMA atomic pool when DMA zone has available managed
pages, otherwise just skip the initialization.

For dma-kmalloc(), for the time being, let's mute the warning of
allocation failure if requesting pages from DMA zone while no manged
pages.  Meanwhile, change code to use dma_alloc_xx/dma_map_xx API to
replace kmalloc(GFP_DMA), or do not use GFP_DMA when calling kmalloc() if
not necessary.  Christoph is posting patches to fix those under
drivers/scsi/.  Finally, we can remove the need of dma-kmalloc() as people
suggested.

This patch (of 3):

In some places of the current kernel, it assumes that dma zone must have
managed pages if CONFIG_ZONE_DMA is enabled.  While this is not always
true.  E.g in kdump kernel of x86_64, only low 1M is presented and locked
down at very early stage of boot, so that there's no managed pages at all
in DMA zone.  This exception will always cause page allocation failure if
page is requested from DMA zone.

Here add function has_managed_dma() and the relevant helper functions to
check if there's DMA zone with managed pages.  It will be used in later
patches.

Link: https://lkml.kernel.org/r/20211223094435.248523-1-bhe@redhat.com
Link: https://lkml.kernel.org/r/20211223094435.248523-2-bhe@redhat.com
Fixes: 6f599d84231f ("x86/kdump: Always reserve the low 1M when the crashkernel option is specified")
Signed-off-by: Baoquan He <bhe@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Acked-by: John Donnelly  <john.p.donnelly@oracle.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Christoph Lameter <cl@linux.com>
Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Laight <David.Laight@ACULAB.COM>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 936dc0b6c226..aed44e9b5d89 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1047,6 +1047,15 @@ static inline int is_highmem_idx(enum zone_type idx)
 #endif
 }
 
+#ifdef CONFIG_ZONE_DMA
+bool has_managed_dma(void);
+#else
+static inline bool has_managed_dma(void)
+{
+	return false;
+}
+#endif
+
 /**
  * is_highmem - helper function to quickly check if a struct zone is a
  *              highmem zone or not.  This is an attempt to keep references
-- 
cgit v1.2.3


From f47761999052b1cc987dd3e3d3adf47997358fc0 Mon Sep 17 00:00:00 2001
From: Mina Almasry <almasrymina@google.com>
Date: Fri, 14 Jan 2022 14:07:48 -0800
Subject: hugetlb: add hugetlb.*.numa_stat file

For hugetlb backed jobs/VMs it's critical to understand the numa
information for the memory backing these jobs to deliver optimal
performance.

Currently this technically can be queried from /proc/self/numa_maps, but
there are significant issues with that.  Namely:

1. Memory can be mapped or unmapped.

2. numa_maps are per process and need to be aggregated across all
   processes in the cgroup.  For shared memory this is more involved as
   the userspace needs to make sure it doesn't double count shared
   mappings.

3. I believe querying numa_maps needs to hold the mmap_lock which adds
   to the contention on this lock.

For these reasons I propose simply adding hugetlb.*.numa_stat file,
   which shows the numa information of the cgroup similarly to
   memory.numa_stat.

On cgroup-v2:
   cat /sys/fs/cgroup/unified/test/hugetlb.2MB.numa_stat
   total=2097152 N0=2097152 N1=0

On cgroup-v1:
   cat /sys/fs/cgroup/hugetlb/test/hugetlb.2MB.numa_stat
   total=2097152 N0=2097152 N1=0
   hierarichal_total=2097152 N0=2097152 N1=0

This patch was tested manually by allocating hugetlb memory and querying
the hugetlb.*.numa_stat file of the cgroup and its parents.

[colin.i.king@googlemail.com: fix spelling mistake "hierarichal" -> "hierarchical"]
  Link: https://lkml.kernel.org/r/20211125090635.23508-1-colin.i.king@gmail.com
[keescook@chromium.org: fix copy/paste array assignment]
  Link: https://lkml.kernel.org/r/20211203065647.2819707-1-keescook@chromium.org

Link: https://lkml.kernel.org/r/20211123001020.4083653-1-almasrymina@google.com
Signed-off-by: Mina Almasry <almasrymina@google.com>
Signed-off-by: Colin Ian King <colin.i.king@gmail.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Jue Wang <juew@google.com>
Cc: Yang Yao <ygyao@google.com>
Cc: Joanna Li <joannali@google.com>
Cc: Cannon Matthews <cannonmatthews@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h        | 4 ++--
 include/linux/hugetlb_cgroup.h | 7 +++++++
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 00351ccb49a3..d1897a69c540 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -622,8 +622,8 @@ struct hstate {
 #endif
 #ifdef CONFIG_CGROUP_HUGETLB
 	/* cgroup control files */
-	struct cftype cgroup_files_dfl[7];
-	struct cftype cgroup_files_legacy[9];
+	struct cftype cgroup_files_dfl[8];
+	struct cftype cgroup_files_legacy[10];
 #endif
 	char name[HSTATE_NAME_LEN];
 };
diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
index ba025ae27882..379344828e78 100644
--- a/include/linux/hugetlb_cgroup.h
+++ b/include/linux/hugetlb_cgroup.h
@@ -36,6 +36,11 @@ enum hugetlb_memory_event {
 	HUGETLB_NR_MEMORY_EVENTS,
 };
 
+struct hugetlb_cgroup_per_node {
+	/* hugetlb usage in pages over all hstates. */
+	unsigned long usage[HUGE_MAX_HSTATE];
+};
+
 struct hugetlb_cgroup {
 	struct cgroup_subsys_state css;
 
@@ -57,6 +62,8 @@ struct hugetlb_cgroup {
 
 	/* Handle for "hugetlb.events.local" */
 	struct cgroup_file events_local_file[HUGE_MAX_HSTATE];
+
+	struct hugetlb_cgroup_per_node *nodeinfo[];
 };
 
 static inline struct hugetlb_cgroup *
-- 
cgit v1.2.3


From e9ea874a8ffb0f8ebed4f4981531a32c5b663d79 Mon Sep 17 00:00:00 2001
From: Yang Yang <yang.yang29@zte.com.cn>
Date: Fri, 14 Jan 2022 14:07:55 -0800
Subject: mm/vmstat: add events for THP max_ptes_* exceeds

There are interfaces to adjust max_ptes_none, max_ptes_swap,
max_ptes_shared values, see
  /sys/kernel/mm/transparent_hugepage/khugepaged/.

But system administrator may not know which value is the best.  So Add
those events to support adjusting max_ptes_* to suitable values.

For example, if default max_ptes_swap value causes too much failures,
and system uses zram whose IO is fast, administrator could increase
max_ptes_swap until THP_SCAN_EXCEED_SWAP_PTE not increase anymore.

Link: https://lkml.kernel.org/r/20211225094036.574157-1-yang.yang29@zte.com.cn
Signed-off-by: Yang Yang <yang.yang29@zte.com.cn>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Saravanan D <saravanand@fb.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vm_event_item.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index a185cc75ff52..7b2363388bfa 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -98,6 +98,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		THP_SPLIT_PAGE_FAILED,
 		THP_DEFERRED_SPLIT_PAGE,
 		THP_SPLIT_PMD,
+		THP_SCAN_EXCEED_NONE_PTE,
+		THP_SCAN_EXCEED_SWAP_PTE,
+		THP_SCAN_EXCEED_SHARED_PTE,
 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
 		THP_SPLIT_PUD,
 #endif
-- 
cgit v1.2.3


From e4b424b7ec8791087375bb1f2480a3ba05d21e0b Mon Sep 17 00:00:00 2001
From: Gang Li <ligang.bdlg@bytedance.com>
Date: Fri, 14 Jan 2022 14:08:07 -0800
Subject: vmscan: make drop_slab_node static

drop_slab_node is only used in drop_slab.  So remove it's declaration
from header file and add keyword static for it's definition.

Link: https://lkml.kernel.org/r/20211111062445.5236-1-ligang.bdlg@bytedance.com
Signed-off-by: Gang Li <ligang.bdlg@bytedance.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index cef65f9cbdf2..eb67eb699b78 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3122,7 +3122,6 @@ int drop_caches_sysctl_handler(struct ctl_table *, int, void *, size_t *,
 #endif
 
 void drop_slab(void);
-void drop_slab_node(int nid);
 
 #ifndef CONFIG_MMU
 #define randomize_va_space 0
-- 
cgit v1.2.3


From c6018b4b254971863bd0ad36bb5e7d0fa0f0ddb0 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
Date: Fri, 14 Jan 2022 14:08:17 -0800
Subject: mm/mempolicy: add set_mempolicy_home_node syscall

This syscall can be used to set a home node for the MPOL_BIND and
MPOL_PREFERRED_MANY memory policy.  Users should use this syscall after
setting up a memory policy for the specified range as shown below.

  mbind(p, nr_pages * page_size, MPOL_BIND, new_nodes->maskp,
        new_nodes->size + 1, 0);
  sys_set_mempolicy_home_node((unsigned long)p, nr_pages * page_size,
				home_node, 0);

The syscall allows specifying a home node/preferred node from which
kernel will fulfill memory allocation requests first.

For address range with MPOL_BIND memory policy, if nodemask specifies
more than one node, page allocations will come from the node in the
nodemask with sufficient free memory that is closest to the home
node/preferred node.

For MPOL_PREFERRED_MANY if the nodemask specifies more than one node,
page allocation will come from the node in the nodemask with sufficient
free memory that is closest to the home node/preferred node.  If there
is not enough memory in all the nodes specified in the nodemask, the
allocation will be attempted from the closest numa node to the home node
in the system.

This helps applications to hint at a memory allocation preference node
and fallback to _only_ a set of nodes if the memory is not available on
the preferred node.  Fallback allocation is attempted from the node
which is nearest to the preferred node.

This helps applications to have control on memory allocation numa nodes
and avoids default fallback to slow memory NUMA nodes.  For example a
system with NUMA nodes 1,2 and 3 with DRAM memory and 10, 11 and 12 of
slow memory

 new_nodes = numa_bitmask_alloc(nr_nodes);

 numa_bitmask_setbit(new_nodes, 1);
 numa_bitmask_setbit(new_nodes, 2);
 numa_bitmask_setbit(new_nodes, 3);

 p = mmap(NULL, nr_pages * page_size, protflag, mapflag, -1, 0);
 mbind(p, nr_pages * page_size, MPOL_BIND, new_nodes->maskp,  new_nodes->size + 1, 0);

 sys_set_mempolicy_home_node(p, nr_pages * page_size, 2, 0);

This will allocate from nodes closer to node 2 and will make sure the
kernel will only allocate from nodes 1, 2, and 3.  Memory will not be
allocated from slow memory nodes 10, 11, and 12.  This differs from
default MPOL_BIND behavior in that with default MPOL_BIND the allocation
will be attempted from node closer to the local node.  One of the
reasons to specify a home node is to allow allocations from cpu less
NUMA node and its nearby NUMA nodes.

With MPOL_PREFERRED_MANY on the other hand will first try to allocate
from the closest node to node 2 from the node list 1, 2 and 3.  If those
nodes don't have enough memory, kernel will allocate from slow memory
node 10, 11 and 12 which ever is closer to node 2.

Link: https://lkml.kernel.org/r/20211202123810.267175-3-aneesh.kumar@linux.ibm.com
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Ben Widawsky <ben.widawsky@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Feng Tang <feng.tang@intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: <linux-api@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mempolicy.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 3c7595e81150..668389b4b53d 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -46,6 +46,7 @@ struct mempolicy {
 	unsigned short mode; 	/* See MPOL_* above */
 	unsigned short flags;	/* See set_mempolicy() MPOL_F_* above */
 	nodemask_t nodes;	/* interleave/bind/perfer */
+	int home_node;		/* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */
 
 	union {
 		nodemask_t cpuset_mems_allowed;	/* relative to these nodes */
-- 
cgit v1.2.3


From 21b084fdf2a49ca1634e8e360e9ab6f9ff0dee11 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
Date: Fri, 14 Jan 2022 14:08:21 -0800
Subject: mm/mempolicy: wire up syscall set_mempolicy_home_node

Link: https://lkml.kernel.org/r/20211202123810.267175-4-aneesh.kumar@linux.ibm.com
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Ben Widawsky <ben.widawsky@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Feng Tang <feng.tang@intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: <linux-api@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/syscalls.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 528a478dbda8..819c0cb00b6d 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1057,6 +1057,9 @@ asmlinkage long sys_landlock_add_rule(int ruleset_fd, enum landlock_rule_type ru
 		const void __user *rule_attr, __u32 flags);
 asmlinkage long sys_landlock_restrict_self(int ruleset_fd, __u32 flags);
 asmlinkage long sys_memfd_secret(unsigned int flags);
+asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long len,
+					    unsigned long home_node,
+					    unsigned long flags);
 
 /*
  * Architecture-specific system calls
-- 
cgit v1.2.3


From c9fdc4d5487a16bd1f003fc8b66e91f88efb50e6 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <naoya.horiguchi@nec.com>
Date: Fri, 14 Jan 2022 14:09:06 -0800
Subject: mm/hwpoison: remove MF_MSG_BUDDY_2ND and MF_MSG_POISONED_HUGE

These action_page_types are no longer used, so remove them.

Link: https://lkml.kernel.org/r/20211115084006.3728254-3-naoya.horiguchi@linux.dev
Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Acked-by: Yang Shi <shy828301@gmail.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Ding Hui <dinghui@sangfor.com.cn>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Cc: Tony Luck <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index eb67eb699b78..7f594da84aca 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3201,7 +3201,6 @@ enum mf_action_page_type {
 	MF_MSG_KERNEL_HIGH_ORDER,
 	MF_MSG_SLAB,
 	MF_MSG_DIFFERENT_COMPOUND,
-	MF_MSG_POISONED_HUGE,
 	MF_MSG_HUGE,
 	MF_MSG_FREE_HUGE,
 	MF_MSG_NON_PMD_HUGE,
@@ -3216,7 +3215,6 @@ enum mf_action_page_type {
 	MF_MSG_CLEAN_LRU,
 	MF_MSG_TRUNCATED_LRU,
 	MF_MSG_BUDDY,
-	MF_MSG_BUDDY_2ND,
 	MF_MSG_DAX,
 	MF_MSG_UNSPLIT_THP,
 	MF_MSG_UNKNOWN,
-- 
cgit v1.2.3


From bf181c582588f8f7406d52f2ee228539b465f173 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <naoya.horiguchi@nec.com>
Date: Fri, 14 Jan 2022 14:09:09 -0800
Subject: mm/hwpoison: fix unpoison_memory()

After recent soft-offline rework, error pages can be taken off from
buddy allocator, but the existing unpoison_memory() does not properly
undo the operation.  Moreover, due to the recent change on
__get_hwpoison_page(), get_page_unless_zero() is hardly called for
hwpoisoned pages.  So __get_hwpoison_page() highly likely returns -EBUSY
(meaning to fail to grab page refcount) and unpoison just clears
PG_hwpoison without releasing a refcount.  That does not lead to a
critical issue like kernel panic, but unpoisoned pages never get back to
buddy (leaked permanently), which is not good.

To (partially) fix this, we need to identify "taken off" pages from
other types of hwpoisoned pages.  We can't use refcount or page flags
for this purpose, so a pseudo flag is defined by hacking ->private
field.  Someone might think that put_page() is enough to cancel
taken-off pages, but the normal free path contains some operations not
suitable for the current purpose, and can fire VM_BUG_ON().

Note that unpoison_memory() is now supposed to be cancel hwpoison events
injected only by madvise() or
/sys/devices/system/memory/{hard,soft}_offline_page, not by MCE
injection, so please don't try to use unpoison when testing with MCE
injection.

[lkp@intel.com: report build failure for ARCH=i386]

Link: https://lkml.kernel.org/r/20211115084006.3728254-4-naoya.horiguchi@linux.dev
Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Reviewed-by: Yang Shi <shy828301@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Ding Hui <dinghui@sangfor.com.cn>
Cc: Tony Luck <tony.luck@intel.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Peter Xu <peterx@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h         | 1 +
 include/linux/page-flags.h | 4 ++++
 2 files changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7f594da84aca..d4fb49a5d60d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3174,6 +3174,7 @@ enum mf_flags {
 	MF_ACTION_REQUIRED = 1 << 1,
 	MF_MUST_KILL = 1 << 2,
 	MF_SOFT_OFFLINE = 1 << 3,
+	MF_UNPOISON = 1 << 4,
 };
 extern int memory_failure(unsigned long pfn, int flags);
 extern void memory_failure_queue(unsigned long pfn, int flags);
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 18423c2157e8..7e2b90dc7d3f 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -522,7 +522,11 @@ PAGEFLAG_FALSE(Uncached, uncached)
 PAGEFLAG(HWPoison, hwpoison, PF_ANY)
 TESTSCFLAG(HWPoison, hwpoison, PF_ANY)
 #define __PG_HWPOISON (1UL << PG_hwpoison)
+#define MAGIC_HWPOISON	0x48575053U	/* HWPS */
+extern void SetPageHWPoisonTakenOff(struct page *page);
+extern void ClearPageHWPoisonTakenOff(struct page *page);
 extern bool take_page_off_buddy(struct page *page);
+extern bool put_page_back_buddy(struct page *page);
 #else
 PAGEFLAG_FALSE(HWPoison, hwpoison)
 #define __PG_HWPOISON 0
-- 
cgit v1.2.3


From 5ee2fa2f063649570c702164f47a558a3432dd9e Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Fri, 14 Jan 2022 14:09:16 -0800
Subject: mm/rmap: fix potential batched TLB flush race

In theory, the following race is possible for batched TLB flushing.

  CPU0                               CPU1
  ----                               ----
  shrink_page_list()
                                     unmap
                                       zap_pte_range()
                                         flush_tlb_batched_pending()
                                           flush_tlb_mm()
    try_to_unmap()
      set_tlb_ubc_flush_pending()
        mm->tlb_flush_batched = true
                                           mm->tlb_flush_batched = false

After the TLB is flushed on CPU1 via flush_tlb_mm() and before
mm->tlb_flush_batched is set to false, some PTE is unmapped on CPU0 and
the TLB flushing is pended.  Then the pended TLB flushing will be lost.
Although both set_tlb_ubc_flush_pending() and
flush_tlb_batched_pending() are called with PTL locked, different PTL
instances may be used.

Because the race window is really small, and the lost TLB flushing will
cause problem only if a TLB entry is inserted before the unmapping in
the race window, the race is only theoretical.  But the fix is simple
and cheap too.

Syzbot has reported this too as follows:

    ==================================================================
    BUG: KCSAN: data-race in flush_tlb_batched_pending / try_to_unmap_one

    write to 0xffff8881072cfbbc of 1 bytes by task 17406 on cpu 1:
     flush_tlb_batched_pending+0x5f/0x80 mm/rmap.c:691
     madvise_free_pte_range+0xee/0x7d0 mm/madvise.c:594
     walk_pmd_range mm/pagewalk.c:128 [inline]
     walk_pud_range mm/pagewalk.c:205 [inline]
     walk_p4d_range mm/pagewalk.c:240 [inline]
     walk_pgd_range mm/pagewalk.c:277 [inline]
     __walk_page_range+0x981/0x1160 mm/pagewalk.c:379
     walk_page_range+0x131/0x300 mm/pagewalk.c:475
     madvise_free_single_vma mm/madvise.c:734 [inline]
     madvise_dontneed_free mm/madvise.c:822 [inline]
     madvise_vma mm/madvise.c:996 [inline]
     do_madvise+0xe4a/0x1140 mm/madvise.c:1202
     __do_sys_madvise mm/madvise.c:1228 [inline]
     __se_sys_madvise mm/madvise.c:1226 [inline]
     __x64_sys_madvise+0x5d/0x70 mm/madvise.c:1226
     do_syscall_x64 arch/x86/entry/common.c:50 [inline]
     do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80
     entry_SYSCALL_64_after_hwframe+0x44/0xae

    write to 0xffff8881072cfbbc of 1 bytes by task 71 on cpu 0:
     set_tlb_ubc_flush_pending mm/rmap.c:636 [inline]
     try_to_unmap_one+0x60e/0x1220 mm/rmap.c:1515
     rmap_walk_anon+0x2fb/0x470 mm/rmap.c:2301
     try_to_unmap+0xec/0x110
     shrink_page_list+0xe91/0x2620 mm/vmscan.c:1719
     shrink_inactive_list+0x3fb/0x730 mm/vmscan.c:2394
     shrink_list mm/vmscan.c:2621 [inline]
     shrink_lruvec+0x3c9/0x710 mm/vmscan.c:2940
     shrink_node_memcgs+0x23e/0x410 mm/vmscan.c:3129
     shrink_node+0x8f6/0x1190 mm/vmscan.c:3252
     kswapd_shrink_node mm/vmscan.c:4022 [inline]
     balance_pgdat+0x702/0xd30 mm/vmscan.c:4213
     kswapd+0x200/0x340 mm/vmscan.c:4473
     kthread+0x2c7/0x2e0 kernel/kthread.c:327
     ret_from_fork+0x1f/0x30

    value changed: 0x01 -> 0x00

    Reported by Kernel Concurrency Sanitizer on:
    CPU: 0 PID: 71 Comm: kswapd0 Not tainted 5.16.0-rc1-syzkaller #0
    Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
    ==================================================================

[akpm@linux-foundation.org: tweak comments]

Link: https://lkml.kernel.org/r/20211201021104.126469-1-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Reported-by: syzbot+aa5bebed695edaccf0df@syzkaller.appspotmail.com
Cc: Nadav Amit <namit@vmware.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Will Deacon <will@kernel.org>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Marco Elver <elver@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6a89f128c990..e3b0476a4fda 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -647,7 +647,7 @@ struct mm_struct {
 		atomic_t tlb_flush_pending;
 #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
 		/* See flush_tlb_batched_pending() */
-		bool tlb_flush_batched;
+		atomic_t tlb_flush_batched;
 #endif
 		struct uprobes_state uprobes_state;
 #ifdef CONFIG_PREEMPT_RT
-- 
cgit v1.2.3


From cab0a7c115546a4865fb7439558af9077a569574 Mon Sep 17 00:00:00 2001
From: Ting Liu <liuting.0x7c00@bytedance.com>
Date: Fri, 14 Jan 2022 14:09:28 -0800
Subject: mm: make some vars and functions static or __init

"page_idle_ops" as a global var, but its scope of use within this
document.  So it should be static.

"page_ext_ops" is a var used in the kernel initial phase.  And other
functions are aslo used in the kernel initial phase.  So they should be
__init or __initdata to reclaim memory.

Link: https://lkml.kernel.org/r/20211217095023.67293-1-liuting.0x7c00@bytedance.com
Signed-off-by: Ting Liu <liuting.0x7c00@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page_idle.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h
index 83abf95e9fa7..4663dfed1293 100644
--- a/include/linux/page_idle.h
+++ b/include/linux/page_idle.h
@@ -13,7 +13,6 @@
  * If there is not enough space to store Idle and Young bits in page flags, use
  * page ext flags instead.
  */
-extern struct page_ext_operations page_idle_ops;
 
 static inline bool folio_test_young(struct folio *folio)
 {
-- 
cgit v1.2.3


From cdeed009f3bceee41f73f0137db785fd29a05cb8 Mon Sep 17 00:00:00 2001
From: Xin Hao <xhao@linux.alibaba.com>
Date: Fri, 14 Jan 2022 14:09:44 -0800
Subject: mm/damon: remove some unneeded function definitions in damon.h

In damon.h some func definitions about VA & PA can only be used in its
own file, so there no need to define in the header file, and the header
file will look cleaner.

If other files later need these functions, the prototypes can be added
to damon.h at that time.

[sj@kernel.org: remove unnecessary function prototype position changes]
 Link: https://lkml.kernel.org/r/20211118114827.20052-1-sj@kernel.org

Link: https://lkml.kernel.org/r/45fd5b3ef6cce8e28dbc1c92f9dc845ccfc949d7.1636989871.git.xhao@linux.alibaba.com
Signed-off-by: Xin Hao <xhao@linux.alibaba.com>
Signed-off-by: SeongJae Park <sj@kernel.org>
Reviewed-by: SeongJae Park <sj@kernel.org>
Cc: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/damon.h | 21 ---------------------
 1 file changed, 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index b4d4be3cc987..1d1be348f506 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -461,34 +461,13 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs);
 #endif	/* CONFIG_DAMON */
 
 #ifdef CONFIG_DAMON_VADDR
-
-/* Monitoring primitives for virtual memory address spaces */
-void damon_va_init(struct damon_ctx *ctx);
-void damon_va_update(struct damon_ctx *ctx);
-void damon_va_prepare_access_checks(struct damon_ctx *ctx);
-unsigned int damon_va_check_accesses(struct damon_ctx *ctx);
 bool damon_va_target_valid(void *t);
-void damon_va_cleanup(struct damon_ctx *ctx);
-int damon_va_apply_scheme(struct damon_ctx *context, struct damon_target *t,
-		struct damon_region *r, struct damos *scheme);
-int damon_va_scheme_score(struct damon_ctx *context, struct damon_target *t,
-		struct damon_region *r, struct damos *scheme);
 void damon_va_set_primitives(struct damon_ctx *ctx);
-
 #endif	/* CONFIG_DAMON_VADDR */
 
 #ifdef CONFIG_DAMON_PADDR
-
-/* Monitoring primitives for the physical memory address space */
-void damon_pa_prepare_access_checks(struct damon_ctx *ctx);
-unsigned int damon_pa_check_accesses(struct damon_ctx *ctx);
 bool damon_pa_target_valid(void *t);
-int damon_pa_apply_scheme(struct damon_ctx *context, struct damon_target *t,
-		struct damon_region *r, struct damos *scheme);
-int damon_pa_scheme_score(struct damon_ctx *context, struct damon_target *t,
-		struct damon_region *r, struct damos *scheme);
 void damon_pa_set_primitives(struct damon_ctx *ctx);
-
 #endif	/* CONFIG_DAMON_PADDR */
 
 #endif	/* _DAMON_H */
-- 
cgit v1.2.3


From 9b2a38d6ef25c1748e3964b0ff30a89e4ed26583 Mon Sep 17 00:00:00 2001
From: Xin Hao <xhao@linux.alibaba.com>
Date: Fri, 14 Jan 2022 14:09:53 -0800
Subject: mm/damon: move damon_rand() definition into damon.h

damon_rand() is called in three files:damon/core.c, damon/ paddr.c,
damon/vaddr.c, i think there is no need to redefine this twice, So move
it to damon.h will be a good choice.

Link: https://lkml.kernel.org/r/20211202075859.51341-1-xhao@linux.alibaba.com
Signed-off-by: Xin Hao <xhao@linux.alibaba.com>
Reviewed-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/damon.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 1d1be348f506..3e91a597a1aa 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -11,12 +11,16 @@
 #include <linux/mutex.h>
 #include <linux/time64.h>
 #include <linux/types.h>
+#include <linux/random.h>
 
 /* Minimal region size.  Every damon_region is aligned by this. */
 #define DAMON_MIN_REGION	PAGE_SIZE
 /* Max priority score for DAMON-based operation schemes */
 #define DAMOS_MAX_SCORE		(99)
 
+/* Get a random number in [l, r) */
+#define damon_rand(l, r) (l + prandom_u32_max(r - l))
+
 /**
  * struct damon_addr_range - Represents an address region of [@start, @end).
  * @start:	Start address of the region (inclusive).
-- 
cgit v1.2.3


From 234d68732b6c135087bdebfa0630a43ae8c27758 Mon Sep 17 00:00:00 2001
From: Xin Hao <xhao@linux.alibaba.com>
Date: Fri, 14 Jan 2022 14:09:56 -0800
Subject: mm/damon: modify damon_rand() macro to static inline function

damon_rand() cannot be implemented as a macro.

Example:
	damon_rand(a++, b);

The value of 'a' will be incremented twice, This is obviously
unreasonable, So there fix it.

Link: https://lkml.kernel.org/r/110ffcd4e420c86c42b41ce2bc9f0fe6a4f32cd3.1638795127.git.xhao@linux.alibaba.com
Fixes: b9a6ac4e4ede ("mm/damon: adaptively adjust regions")
Signed-off-by: Xin Hao <xhao@linux.alibaba.com>
Reported-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/damon.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 3e91a597a1aa..e2c8152985b7 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -19,7 +19,10 @@
 #define DAMOS_MAX_SCORE		(99)
 
 /* Get a random number in [l, r) */
-#define damon_rand(l, r) (l + prandom_u32_max(r - l))
+static inline unsigned long damon_rand(unsigned long l, unsigned long r)
+{
+	return l + prandom_u32_max(r - l);
+}
 
 /**
  * struct damon_addr_range - Represents an address region of [@start, @end).
-- 
cgit v1.2.3


From 88f86dcfa454784f7de550966c60fc78a3e95d6d Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 14 Jan 2022 14:09:59 -0800
Subject: mm/damon: convert macro functions to static inline functions

Patch series "mm/damon: Misc cleanups".

This patchset contains miscellaneous cleanups for DAMON's macro
functions and documentation.

This patch (of 6):

This commit converts macro functions in DAMON to static inline functions,
for better type checking, code documentation, etc[1].

[1] https://lore.kernel.org/linux-mm/20211202151213.6ec830863342220da4141bc5@linux-foundation.org/

Link: https://lkml.kernel.org/r/20211209131806.19317-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20211209131806.19317-2-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/damon.h | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index e2c8152985b7..2dbc1f545da2 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -399,14 +399,20 @@ struct damon_ctx {
 	struct list_head schemes;
 };
 
-#define damon_next_region(r) \
-	(container_of(r->list.next, struct damon_region, list))
+static inline struct damon_region *damon_next_region(struct damon_region *r)
+{
+	return container_of(r->list.next, struct damon_region, list);
+}
 
-#define damon_prev_region(r) \
-	(container_of(r->list.prev, struct damon_region, list))
+static inline struct damon_region *damon_prev_region(struct damon_region *r)
+{
+	return container_of(r->list.prev, struct damon_region, list);
+}
 
-#define damon_last_region(t) \
-	(list_last_entry(&t->regions_list, struct damon_region, list))
+static inline struct damon_region *damon_last_region(struct damon_target *t)
+{
+	return list_last_entry(&t->regions_list, struct damon_region, list);
+}
 
 #define damon_for_each_region(r, t) \
 	list_for_each_entry(r, &t->regions_list, list)
-- 
cgit v1.2.3


From f4c6d22c6cf282ef7d24a724b9bd978ee2b74fc6 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 14 Jan 2022 14:10:14 -0800
Subject: mm/damon: remove a mistakenly added comment for a future feature

Due to a mistake in patches reordering, a comment for a future feature
called 'arbitrary monitoring target support'[1], which is still under
development, has added.  Because it only introduces confusion and we
don't have a plan to post the patches soon, this commit removes the
mistakenly added part.

[1] https://lore.kernel.org/linux-mm/20201215115448.25633-3-sjpark@amazon.com/

Link: https://lkml.kernel.org/r/20211209131806.19317-7-sj@kernel.org
Fixes: 1f366e421c8f ("mm/damon/core: implement DAMON-based Operation Schemes (DAMOS)")
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/damon.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 2dbc1f545da2..97f4a224e950 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -281,7 +281,7 @@ struct damon_ctx;
  * as an integer in [0, &DAMOS_MAX_SCORE].
  * @apply_scheme is called from @kdamond when a region for user provided
  * DAMON-based operation scheme is found.  It should apply the scheme's action
- * to the region.  This is not used for &DAMON_ARBITRARY_TARGET case.
+ * to the region.
  * @target_valid should check whether the target is still valid for the
  * monitoring.
  * @cleanup is called from @kdamond just before its termination.
-- 
cgit v1.2.3


From 0e92c2ee9f459542c5384d9cfab24873c3dd6398 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 14 Jan 2022 14:10:17 -0800
Subject: mm/damon/schemes: account scheme actions that successfully applied

Patch series "mm/damon/schemes: Extend stats for better online analysis and tuning".

To help online access pattern analysis and tuning of DAMON-based
Operation Schemes (DAMOS), DAMOS provides simple statistics for each
scheme.  Introduction of DAMOS time/space quota further made the tuning
easier by making the risk management easier.  However, that also made
understanding of the working schemes a little bit more difficult.

For an example, progress of a given scheme can now be throttled by not
only the aggressiveness of the target access pattern, but also the
time/space quotas.  So, when a scheme is showing unexpectedly slow
progress, it's difficult to know by what the progress of the scheme is
throttled, with currently provided statistics.

This patchset extends the statistics to contain some metrics that can be
helpful for such online schemes analysis and tuning (patches 1-2),
exports those to users (patches 3 and 5), and add documents (patches 4
and 6).

This patch (of 6):

DAMON-based operation schemes (DAMOS) stats provide only the number and
the amount of regions that the action of the scheme has tried to be
applied.  Because the action could be failed for some reasons, the
currently provided information is sometimes not useful or convenient
enough for schemes profiling and tuning.  To improve this situation,
this commit extends the DAMOS stats to provide the number and the amount
of regions that the action has successfully applied.

Link: https://lkml.kernel.org/r/20211210150016.35349-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20211210150016.35349-2-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/damon.h | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 97f4a224e950..e0ad3d9aaeed 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -192,6 +192,20 @@ struct damos_watermarks {
 	bool activated;
 };
 
+/**
+ * struct damos_stat - Statistics on a given scheme.
+ * @nr_tried:	Total number of regions that the scheme is tried to be applied.
+ * @sz_tried:	Total size of regions that the scheme is tried to be applied.
+ * @nr_applied:	Total number of regions that the scheme is applied.
+ * @sz_applied:	Total size of regions that the scheme is applied.
+ */
+struct damos_stat {
+	unsigned long nr_tried;
+	unsigned long sz_tried;
+	unsigned long nr_applied;
+	unsigned long sz_applied;
+};
+
 /**
  * struct damos - Represents a Data Access Monitoring-based Operation Scheme.
  * @min_sz_region:	Minimum size of target regions.
@@ -203,8 +217,7 @@ struct damos_watermarks {
  * @action:		&damo_action to be applied to the target regions.
  * @quota:		Control the aggressiveness of this scheme.
  * @wmarks:		Watermarks for automated (in)activation of this scheme.
- * @stat_count:		Total number of regions that this scheme is applied.
- * @stat_sz:		Total size of regions that this scheme is applied.
+ * @stat:		Statistics of this scheme.
  * @list:		List head for siblings.
  *
  * For each aggregation interval, DAMON finds regions which fit in the
@@ -235,8 +248,7 @@ struct damos {
 	enum damos_action action;
 	struct damos_quota quota;
 	struct damos_watermarks wmarks;
-	unsigned long stat_count;
-	unsigned long stat_sz;
+	struct damos_stat stat;
 	struct list_head list;
 };
 
@@ -281,7 +293,8 @@ struct damon_ctx;
  * as an integer in [0, &DAMOS_MAX_SCORE].
  * @apply_scheme is called from @kdamond when a region for user provided
  * DAMON-based operation scheme is found.  It should apply the scheme's action
- * to the region.
+ * to the region and return bytes of the region that the action is successfully
+ * applied.
  * @target_valid should check whether the target is still valid for the
  * monitoring.
  * @cleanup is called from @kdamond just before its termination.
@@ -295,8 +308,9 @@ struct damon_primitive {
 	int (*get_scheme_score)(struct damon_ctx *context,
 			struct damon_target *t, struct damon_region *r,
 			struct damos *scheme);
-	int (*apply_scheme)(struct damon_ctx *context, struct damon_target *t,
-			struct damon_region *r, struct damos *scheme);
+	unsigned long (*apply_scheme)(struct damon_ctx *context,
+			struct damon_target *t, struct damon_region *r,
+			struct damos *scheme);
 	bool (*target_valid)(void *target);
 	void (*cleanup)(struct damon_ctx *context);
 };
-- 
cgit v1.2.3


From 6268eac34ca30af7f6313504d556ec7fcd295621 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 14 Jan 2022 14:10:20 -0800
Subject: mm/damon/schemes: account how many times quota limit has exceeded

If the time/space quotas of a given DAMON-based operation scheme is too
small, the scheme could show unexpectedly slow progress.  However, there
is no good way to notice the case in runtime.  This commit extends the
DAMOS stat to provide how many times the quota limits exceeded so that
the users can easily notice the case and tune the scheme.

Link: https://lkml.kernel.org/r/20211210150016.35349-3-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/damon.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index e0ad3d9aaeed..af648388e759 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -198,12 +198,14 @@ struct damos_watermarks {
  * @sz_tried:	Total size of regions that the scheme is tried to be applied.
  * @nr_applied:	Total number of regions that the scheme is applied.
  * @sz_applied:	Total size of regions that the scheme is applied.
+ * @qt_exceeds: Total number of times the quota of the scheme has exceeded.
  */
 struct damos_stat {
 	unsigned long nr_tried;
 	unsigned long sz_tried;
 	unsigned long nr_applied;
 	unsigned long sz_applied;
+	unsigned long qt_exceeds;
 };
 
 /**
-- 
cgit v1.2.3


From 2cd4b8e10cc31eadb5b10b1d73b3f28156f3776c Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <guoqing.jiang@linux.dev>
Date: Fri, 14 Jan 2022 14:10:38 -0800
Subject: mm/damon: move the implementation of damon_insert_region to damon.h

Usually, inline function is declared static since it should sit between
storage and type.  And implement it in a header file if used by multiple
files.

And this change also fixes compile issue when backport damon to 5.10.

  mm/damon/vaddr.c: In function `damon_va_evenly_split_region':
  ./include/linux/damon.h:425:13: error: inlining failed in call to `always_inline' `damon_insert_region': function body not available
  425 | inline void damon_insert_region(struct damon_region *r,
      | ^~~~~~~~~~~~~~~~~~~
  mm/damon/vaddr.c:86:3: note: called from here
  86 | damon_insert_region(n, r, next, t);
     | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Link: https://lkml.kernel.org/r/20211223085703.6142-1-guoqing.jiang@linux.dev
Signed-off-by: Guoqing Jiang <guoqing.jiang@linux.dev>
Reviewed-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/damon.h | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index af648388e759..5e1e3a128b77 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -451,9 +451,18 @@ static inline struct damon_region *damon_last_region(struct damon_target *t)
 #ifdef CONFIG_DAMON
 
 struct damon_region *damon_new_region(unsigned long start, unsigned long end);
-inline void damon_insert_region(struct damon_region *r,
+
+/*
+ * Add a region between two other regions
+ */
+static inline void damon_insert_region(struct damon_region *r,
 		struct damon_region *prev, struct damon_region *next,
-		struct damon_target *t);
+		struct damon_target *t)
+{
+	__list_add(&r->list, &prev->list, &next->list);
+	t->nr_regions++;
+}
+
 void damon_add_region(struct damon_region *r, struct damon_target *t);
 void damon_destroy_region(struct damon_region *r, struct damon_target *t);
 
-- 
cgit v1.2.3


From 47d8c15615c0a2046d2d90b04cb80b81ddf31fb1 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Sat, 14 Aug 2021 14:16:59 -0700
Subject: include: move find.h from asm_generic to linux

find_bit API and bitmap API are closely related, but inclusion paths
are different - include/asm-generic and include/linux, correspondingly.
In the past it made a lot of troubles due to circular dependencies
and/or undefined symbols. Fix this by moving find.h under include/linux.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
Tested-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 include/linux/bitmap.h |   1 +
 include/linux/find.h   | 268 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 269 insertions(+)
 create mode 100644 include/linux/find.h

(limited to 'include/linux')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index a241dcf50f39..ead4a150bd7f 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -6,6 +6,7 @@
 
 #include <linux/align.h>
 #include <linux/bitops.h>
+#include <linux/find.h>
 #include <linux/limits.h>
 #include <linux/string.h>
 #include <linux/types.h>
diff --git a/include/linux/find.h b/include/linux/find.h
new file mode 100644
index 000000000000..c5410c243e04
--- /dev/null
+++ b/include/linux/find.h
@@ -0,0 +1,268 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_FIND_H_
+#define __LINUX_FIND_H_
+
+#ifndef __LINUX_BITMAP_H
+#error only <linux/bitmap.h> can be included directly
+#endif
+
+#include <linux/bitops.h>
+
+extern unsigned long _find_next_bit(const unsigned long *addr1,
+		const unsigned long *addr2, unsigned long nbits,
+		unsigned long start, unsigned long invert, unsigned long le);
+extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size);
+extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size);
+extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size);
+
+#ifndef find_next_bit
+/**
+ * find_next_bit - find the next set bit in a memory region
+ * @addr: The address to base the search on
+ * @offset: The bitnumber to start searching at
+ * @size: The bitmap size in bits
+ *
+ * Returns the bit number for the next set bit
+ * If no bits are set, returns @size.
+ */
+static inline
+unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
+			    unsigned long offset)
+{
+	if (small_const_nbits(size)) {
+		unsigned long val;
+
+		if (unlikely(offset >= size))
+			return size;
+
+		val = *addr & GENMASK(size - 1, offset);
+		return val ? __ffs(val) : size;
+	}
+
+	return _find_next_bit(addr, NULL, size, offset, 0UL, 0);
+}
+#endif
+
+#ifndef find_next_and_bit
+/**
+ * find_next_and_bit - find the next set bit in both memory regions
+ * @addr1: The first address to base the search on
+ * @addr2: The second address to base the search on
+ * @offset: The bitnumber to start searching at
+ * @size: The bitmap size in bits
+ *
+ * Returns the bit number for the next set bit
+ * If no bits are set, returns @size.
+ */
+static inline
+unsigned long find_next_and_bit(const unsigned long *addr1,
+		const unsigned long *addr2, unsigned long size,
+		unsigned long offset)
+{
+	if (small_const_nbits(size)) {
+		unsigned long val;
+
+		if (unlikely(offset >= size))
+			return size;
+
+		val = *addr1 & *addr2 & GENMASK(size - 1, offset);
+		return val ? __ffs(val) : size;
+	}
+
+	return _find_next_bit(addr1, addr2, size, offset, 0UL, 0);
+}
+#endif
+
+#ifndef find_next_zero_bit
+/**
+ * find_next_zero_bit - find the next cleared bit in a memory region
+ * @addr: The address to base the search on
+ * @offset: The bitnumber to start searching at
+ * @size: The bitmap size in bits
+ *
+ * Returns the bit number of the next zero bit
+ * If no bits are zero, returns @size.
+ */
+static inline
+unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
+				 unsigned long offset)
+{
+	if (small_const_nbits(size)) {
+		unsigned long val;
+
+		if (unlikely(offset >= size))
+			return size;
+
+		val = *addr | ~GENMASK(size - 1, offset);
+		return val == ~0UL ? size : ffz(val);
+	}
+
+	return _find_next_bit(addr, NULL, size, offset, ~0UL, 0);
+}
+#endif
+
+#ifdef CONFIG_GENERIC_FIND_FIRST_BIT
+
+#ifndef find_first_bit
+/**
+ * find_first_bit - find the first set bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The maximum number of bits to search
+ *
+ * Returns the bit number of the first set bit.
+ * If no bits are set, returns @size.
+ */
+static inline
+unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
+{
+	if (small_const_nbits(size)) {
+		unsigned long val = *addr & GENMASK(size - 1, 0);
+
+		return val ? __ffs(val) : size;
+	}
+
+	return _find_first_bit(addr, size);
+}
+#endif
+
+#ifndef find_first_zero_bit
+/**
+ * find_first_zero_bit - find the first cleared bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The maximum number of bits to search
+ *
+ * Returns the bit number of the first cleared bit.
+ * If no bits are zero, returns @size.
+ */
+static inline
+unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
+{
+	if (small_const_nbits(size)) {
+		unsigned long val = *addr | ~GENMASK(size - 1, 0);
+
+		return val == ~0UL ? size : ffz(val);
+	}
+
+	return _find_first_zero_bit(addr, size);
+}
+#endif
+
+#else /* CONFIG_GENERIC_FIND_FIRST_BIT */
+
+#ifndef find_first_bit
+#define find_first_bit(addr, size) find_next_bit((addr), (size), 0)
+#endif
+#ifndef find_first_zero_bit
+#define find_first_zero_bit(addr, size) find_next_zero_bit((addr), (size), 0)
+#endif
+
+#endif /* CONFIG_GENERIC_FIND_FIRST_BIT */
+
+#ifndef find_last_bit
+/**
+ * find_last_bit - find the last set bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The number of bits to search
+ *
+ * Returns the bit number of the last set bit, or size.
+ */
+static inline
+unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
+{
+	if (small_const_nbits(size)) {
+		unsigned long val = *addr & GENMASK(size - 1, 0);
+
+		return val ? __fls(val) : size;
+	}
+
+	return _find_last_bit(addr, size);
+}
+#endif
+
+/**
+ * find_next_clump8 - find next 8-bit clump with set bits in a memory region
+ * @clump: location to store copy of found clump
+ * @addr: address to base the search on
+ * @size: bitmap size in number of bits
+ * @offset: bit offset at which to start searching
+ *
+ * Returns the bit offset for the next set clump; the found clump value is
+ * copied to the location pointed by @clump. If no bits are set, returns @size.
+ */
+extern unsigned long find_next_clump8(unsigned long *clump,
+				      const unsigned long *addr,
+				      unsigned long size, unsigned long offset);
+
+#define find_first_clump8(clump, bits, size) \
+	find_next_clump8((clump), (bits), (size), 0)
+
+#if defined(__LITTLE_ENDIAN)
+
+static inline unsigned long find_next_zero_bit_le(const void *addr,
+		unsigned long size, unsigned long offset)
+{
+	return find_next_zero_bit(addr, size, offset);
+}
+
+static inline unsigned long find_next_bit_le(const void *addr,
+		unsigned long size, unsigned long offset)
+{
+	return find_next_bit(addr, size, offset);
+}
+
+static inline unsigned long find_first_zero_bit_le(const void *addr,
+		unsigned long size)
+{
+	return find_first_zero_bit(addr, size);
+}
+
+#elif defined(__BIG_ENDIAN)
+
+#ifndef find_next_zero_bit_le
+static inline
+unsigned long find_next_zero_bit_le(const void *addr, unsigned
+		long size, unsigned long offset)
+{
+	if (small_const_nbits(size)) {
+		unsigned long val = *(const unsigned long *)addr;
+
+		if (unlikely(offset >= size))
+			return size;
+
+		val = swab(val) | ~GENMASK(size - 1, offset);
+		return val == ~0UL ? size : ffz(val);
+	}
+
+	return _find_next_bit(addr, NULL, size, offset, ~0UL, 1);
+}
+#endif
+
+#ifndef find_next_bit_le
+static inline
+unsigned long find_next_bit_le(const void *addr, unsigned
+		long size, unsigned long offset)
+{
+	if (small_const_nbits(size)) {
+		unsigned long val = *(const unsigned long *)addr;
+
+		if (unlikely(offset >= size))
+			return size;
+
+		val = swab(val) & GENMASK(size - 1, offset);
+		return val ? __ffs(val) : size;
+	}
+
+	return _find_next_bit(addr, NULL, size, offset, 0UL, 1);
+}
+#endif
+
+#ifndef find_first_zero_bit_le
+#define find_first_zero_bit_le(addr, size) \
+	find_next_zero_bit_le((addr), (size), 0)
+#endif
+
+#else
+#error "Please fix <asm/byteorder.h>"
+#endif
+
+#endif /*__LINUX_FIND_H_ */
-- 
cgit v1.2.3


From c126a53c276048125b4a950072bab37ad0fea120 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Sat, 14 Aug 2021 14:17:00 -0700
Subject: arch: remove GENERIC_FIND_FIRST_BIT entirely

In 5.12 cycle we enabled GENERIC_FIND_FIRST_BIT config option for ARM64
and MIPS. It increased performance and shrunk .text size; and so far
I didn't receive any negative feedback on the change.

https://lore.kernel.org/linux-arch/20210225135700.1381396-1-yury.norov@gmail.com/

Now I think it's a good time to switch all architectures to use
find_{first,last}_bit() unconditionally, and so remove corresponding
config option.

The patch does't introduce functioal changes for arc, arm, arm64, mips,
m68k, s390 and x86, for other architectures I expect improvement both in
performance and .text size.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
Tested-by: Alexander Lobakin <alobakin@pm.me> (mips)
Reviewed-by: Alexander Lobakin <alobakin@pm.me> (mips)
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Will Deacon <will@kernel.org>
Tested-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
---
 include/linux/find.h | 13 -------------
 1 file changed, 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/find.h b/include/linux/find.h
index c5410c243e04..ea57f7f38c49 100644
--- a/include/linux/find.h
+++ b/include/linux/find.h
@@ -101,8 +101,6 @@ unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
 }
 #endif
 
-#ifdef CONFIG_GENERIC_FIND_FIRST_BIT
-
 #ifndef find_first_bit
 /**
  * find_first_bit - find the first set bit in a memory region
@@ -147,17 +145,6 @@ unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
 }
 #endif
 
-#else /* CONFIG_GENERIC_FIND_FIRST_BIT */
-
-#ifndef find_first_bit
-#define find_first_bit(addr, size) find_next_bit((addr), (size), 0)
-#endif
-#ifndef find_first_zero_bit
-#define find_first_zero_bit(addr, size) find_next_zero_bit((addr), (size), 0)
-#endif
-
-#endif /* CONFIG_GENERIC_FIND_FIRST_BIT */
-
 #ifndef find_last_bit
 /**
  * find_last_bit - find the last set bit in a memory region
-- 
cgit v1.2.3


From f68edc9297bf3f7c94abb54b9b0b053607f7587b Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Sat, 14 Aug 2021 14:17:01 -0700
Subject: lib: add find_first_and_bit()

Currently find_first_and_bit() is an alias to find_next_and_bit(). However,
it is widely used in cpumask, so it worth to optimize it. This patch adds
its own implementation for find_first_and_bit().

On x86_64 find_bit_benchmark says:

Before (#define find_first_and_bit(...) find_next_and_bit(..., 0):
Start testing find_bit() with random-filled bitmap
[  140.291468] find_first_and_bit:           46890919 ns,  32671 iterations
Start testing find_bit() with sparse bitmap
[  140.295028] find_first_and_bit:               7103 ns,      1 iterations

After:
Start testing find_bit() with random-filled bitmap
[  162.574907] find_first_and_bit:           25045813 ns,  32846 iterations
Start testing find_bit() with sparse bitmap
[  162.578458] find_first_and_bit:               4900 ns,      1 iterations

(Thanks to Alexey Klimov for thorough testing.)

Signed-off-by: Yury Norov <yury.norov@gmail.com>
Tested-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Tested-by: Alexey Klimov <aklimov@redhat.com>
---
 include/linux/find.h | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/find.h b/include/linux/find.h
index ea57f7f38c49..6048f8c97418 100644
--- a/include/linux/find.h
+++ b/include/linux/find.h
@@ -12,6 +12,8 @@ extern unsigned long _find_next_bit(const unsigned long *addr1,
 		const unsigned long *addr2, unsigned long nbits,
 		unsigned long start, unsigned long invert, unsigned long le);
 extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size);
+extern unsigned long _find_first_and_bit(const unsigned long *addr1,
+					 const unsigned long *addr2, unsigned long size);
 extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size);
 extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size);
 
@@ -123,6 +125,31 @@ unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
 }
 #endif
 
+#ifndef find_first_and_bit
+/**
+ * find_first_and_bit - find the first set bit in both memory regions
+ * @addr1: The first address to base the search on
+ * @addr2: The second address to base the search on
+ * @size: The bitmap size in bits
+ *
+ * Returns the bit number for the next set bit
+ * If no bits are set, returns @size.
+ */
+static inline
+unsigned long find_first_and_bit(const unsigned long *addr1,
+				 const unsigned long *addr2,
+				 unsigned long size)
+{
+	if (small_const_nbits(size)) {
+		unsigned long val = *addr1 & *addr2 & GENMASK(size - 1, 0);
+
+		return val ? __ffs(val) : size;
+	}
+
+	return _find_first_and_bit(addr1, addr2, size);
+}
+#endif
+
 #ifndef find_first_zero_bit
 /**
  * find_first_zero_bit - find the first cleared bit in a memory region
-- 
cgit v1.2.3


From 93ba139ba8190c33009c5353ca43c8519443f467 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Sat, 14 Aug 2021 14:17:02 -0700
Subject: cpumask: use find_first_and_bit()

Now we have an efficient implementation for find_first_and_bit(),
so switch cpumask to use it where appropriate.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
Tested-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
---
 include/linux/cpumask.h | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 1e7399fc69c0..c4e1b9ea0ba4 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -123,6 +123,12 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp)
 	return 0;
 }
 
+static inline unsigned int cpumask_first_and(const struct cpumask *srcp1,
+					     const struct cpumask *srcp2)
+{
+	return 0;
+}
+
 static inline unsigned int cpumask_last(const struct cpumask *srcp)
 {
 	return 0;
@@ -167,7 +173,7 @@ static inline unsigned int cpumask_local_spread(unsigned int i, int node)
 
 static inline int cpumask_any_and_distribute(const struct cpumask *src1p,
 					     const struct cpumask *src2p) {
-	return cpumask_next_and(-1, src1p, src2p);
+	return cpumask_first_and(src1p, src2p);
 }
 
 static inline int cpumask_any_distribute(const struct cpumask *srcp)
@@ -195,6 +201,19 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp)
 	return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits);
 }
 
+/**
+ * cpumask_first_and - return the first cpu from *srcp1 & *srcp2
+ * @src1p: the first input
+ * @src2p: the second input
+ *
+ * Returns >= nr_cpu_ids if no cpus set in both.  See also cpumask_next_and().
+ */
+static inline
+unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask *srcp2)
+{
+	return find_first_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), nr_cpumask_bits);
+}
+
 /**
  * cpumask_last - get the last CPU in a cpumask
  * @srcp:	- the cpumask pointer
@@ -585,15 +604,6 @@ static inline void cpumask_copy(struct cpumask *dstp,
  */
 #define cpumask_any(srcp) cpumask_first(srcp)
 
-/**
- * cpumask_first_and - return the first cpu from *srcp1 & *srcp2
- * @src1p: the first input
- * @src2p: the second input
- *
- * Returns >= nr_cpu_ids if no cpus set in both.  See also cpumask_next_and().
- */
-#define cpumask_first_and(src1p, src2p) cpumask_next_and(-1, (src1p), (src2p))
-
 /**
  * cpumask_any_and - pick a "random" cpu from *mask1 & *mask2
  * @mask1: the first input cpumask
-- 
cgit v1.2.3


From 9b51d9d866482a703646fd4c07e433c3d9d88efd Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Sat, 14 Aug 2021 14:17:05 -0700
Subject: cpumask: replace cpumask_next_* with cpumask_first_* where
 appropriate

cpumask_first() is a more effective analogue of 'next' version if n == -1
(which means start == 0). This patch replaces 'next' with 'first' where
things look trivial.

There's no cpumask_first_zero() function, so create it.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
Tested-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
---
 include/linux/cpumask.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index c4e1b9ea0ba4..64dae70d31f5 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -123,6 +123,11 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp)
 	return 0;
 }
 
+static inline unsigned int cpumask_first_zero(const struct cpumask *srcp)
+{
+	return 0;
+}
+
 static inline unsigned int cpumask_first_and(const struct cpumask *srcp1,
 					     const struct cpumask *srcp2)
 {
@@ -201,6 +206,17 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp)
 	return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits);
 }
 
+/**
+ * cpumask_first_zero - get the first unset cpu in a cpumask
+ * @srcp: the cpumask pointer
+ *
+ * Returns >= nr_cpu_ids if all cpus are set.
+ */
+static inline unsigned int cpumask_first_zero(const struct cpumask *srcp)
+{
+	return find_first_zero_bit(cpumask_bits(srcp), nr_cpumask_bits);
+}
+
 /**
  * cpumask_first_and - return the first cpu from *srcp1 & *srcp2
  * @src1p: the first input
-- 
cgit v1.2.3


From bc9d6635c293a2ac30c6319f7cfd08860ab7948a Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Sat, 14 Aug 2021 14:17:06 -0700
Subject: include/linux: move for_each_bit() macros from bitops.h to find.h

for_each_bit() macros depend on find_bit() machinery, and so the
proper place for them is the find.h header.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
Tested-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
---
 include/linux/bitops.h | 34 ----------------------------------
 include/linux/find.h   | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 5e62e2383b7f..7aaed501f768 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -32,40 +32,6 @@ extern unsigned long __sw_hweight64(__u64 w);
  */
 #include <asm/bitops.h>
 
-#define for_each_set_bit(bit, addr, size) \
-	for ((bit) = find_first_bit((addr), (size));		\
-	     (bit) < (size);					\
-	     (bit) = find_next_bit((addr), (size), (bit) + 1))
-
-/* same as for_each_set_bit() but use bit as value to start with */
-#define for_each_set_bit_from(bit, addr, size) \
-	for ((bit) = find_next_bit((addr), (size), (bit));	\
-	     (bit) < (size);					\
-	     (bit) = find_next_bit((addr), (size), (bit) + 1))
-
-#define for_each_clear_bit(bit, addr, size) \
-	for ((bit) = find_first_zero_bit((addr), (size));	\
-	     (bit) < (size);					\
-	     (bit) = find_next_zero_bit((addr), (size), (bit) + 1))
-
-/* same as for_each_clear_bit() but use bit as value to start with */
-#define for_each_clear_bit_from(bit, addr, size) \
-	for ((bit) = find_next_zero_bit((addr), (size), (bit));	\
-	     (bit) < (size);					\
-	     (bit) = find_next_zero_bit((addr), (size), (bit) + 1))
-
-/**
- * for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits
- * @start: bit offset to start search and to store the current iteration offset
- * @clump: location to store copy of current 8-bit clump
- * @bits: bitmap address to base the search on
- * @size: bitmap size in number of bits
- */
-#define for_each_set_clump8(start, clump, bits, size) \
-	for ((start) = find_first_clump8(&(clump), (bits), (size)); \
-	     (start) < (size); \
-	     (start) = find_next_clump8(&(clump), (bits), (size), (start) + 8))
-
 static inline int get_bitmask_order(unsigned int count)
 {
 	int order;
diff --git a/include/linux/find.h b/include/linux/find.h
index 6048f8c97418..4500e8ab93e2 100644
--- a/include/linux/find.h
+++ b/include/linux/find.h
@@ -279,4 +279,38 @@ unsigned long find_next_bit_le(const void *addr, unsigned
 #error "Please fix <asm/byteorder.h>"
 #endif
 
+#define for_each_set_bit(bit, addr, size) \
+	for ((bit) = find_first_bit((addr), (size));		\
+	     (bit) < (size);					\
+	     (bit) = find_next_bit((addr), (size), (bit) + 1))
+
+/* same as for_each_set_bit() but use bit as value to start with */
+#define for_each_set_bit_from(bit, addr, size) \
+	for ((bit) = find_next_bit((addr), (size), (bit));	\
+	     (bit) < (size);					\
+	     (bit) = find_next_bit((addr), (size), (bit) + 1))
+
+#define for_each_clear_bit(bit, addr, size) \
+	for ((bit) = find_first_zero_bit((addr), (size));	\
+	     (bit) < (size);					\
+	     (bit) = find_next_zero_bit((addr), (size), (bit) + 1))
+
+/* same as for_each_clear_bit() but use bit as value to start with */
+#define for_each_clear_bit_from(bit, addr, size) \
+	for ((bit) = find_next_zero_bit((addr), (size), (bit));	\
+	     (bit) < (size);					\
+	     (bit) = find_next_zero_bit((addr), (size), (bit) + 1))
+
+/**
+ * for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits
+ * @start: bit offset to start search and to store the current iteration offset
+ * @clump: location to store copy of current 8-bit clump
+ * @bits: bitmap address to base the search on
+ * @size: bitmap size in number of bits
+ */
+#define for_each_set_clump8(start, clump, bits, size) \
+	for ((start) = find_first_clump8(&(clump), (bits), (size)); \
+	     (start) < (size); \
+	     (start) = find_next_clump8(&(clump), (bits), (size), (start) + 8))
+
 #endif /*__LINUX_FIND_H_ */
-- 
cgit v1.2.3


From 7516be9931b8bc8bcaac8531f490b42ab11ded1e Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Sat, 14 Aug 2021 14:17:07 -0700
Subject: find: micro-optimize for_each_{set,clear}_bit()

The macros iterate thru all set/clear bits in a bitmap. They search a
first bit using find_first_bit(), and the rest bits using find_next_bit().

Since find_next_bit() is called shortly after find_first_bit(), we can
save few lines of I-cache by not using find_first_bit().

Signed-off-by: Yury Norov <yury.norov@gmail.com>
Tested-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
---
 include/linux/find.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/find.h b/include/linux/find.h
index 4500e8ab93e2..ae9ed52b52b8 100644
--- a/include/linux/find.h
+++ b/include/linux/find.h
@@ -280,7 +280,7 @@ unsigned long find_next_bit_le(const void *addr, unsigned
 #endif
 
 #define for_each_set_bit(bit, addr, size) \
-	for ((bit) = find_first_bit((addr), (size));		\
+	for ((bit) = find_next_bit((addr), (size), 0);		\
 	     (bit) < (size);					\
 	     (bit) = find_next_bit((addr), (size), (bit) + 1))
 
@@ -291,7 +291,7 @@ unsigned long find_next_bit_le(const void *addr, unsigned
 	     (bit) = find_next_bit((addr), (size), (bit) + 1))
 
 #define for_each_clear_bit(bit, addr, size) \
-	for ((bit) = find_first_zero_bit((addr), (size));	\
+	for ((bit) = find_next_zero_bit((addr), (size), 0);	\
 	     (bit) < (size);					\
 	     (bit) = find_next_zero_bit((addr), (size), (bit) + 1))
 
-- 
cgit v1.2.3


From ec288a2cf7ca40a939316b6df206ab845bb112d1 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Sat, 14 Aug 2021 14:17:11 -0700
Subject: bitmap: unify find_bit operations

bitmap_for_each_{set,clear}_region() are similar to for_each_bit()
macros in include/linux/find.h, but interface and implementation
of them are different.

This patch adds for_each_bitrange() macros and drops unused
bitmap_*_region() API in sake of unification.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
Tested-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Acked-by: Dennis Zhou <dennis@kernel.org>
Acked-by: Ulf Hansson <ulf.hansson@linaro.org> # For MMC
---
 include/linux/bitmap.h | 33 -----------------------------
 include/linux/find.h   | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index ead4a150bd7f..7dba0847510c 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -55,12 +55,6 @@ struct device;
  *  bitmap_clear(dst, pos, nbits)               Clear specified bit area
  *  bitmap_find_next_zero_area(buf, len, pos, n, mask)  Find bit free area
  *  bitmap_find_next_zero_area_off(buf, len, pos, n, mask, mask_off)  as above
- *  bitmap_next_clear_region(map, &start, &end, nbits)  Find next clear region
- *  bitmap_next_set_region(map, &start, &end, nbits)  Find next set region
- *  bitmap_for_each_clear_region(map, rs, re, start, end)
- *  						Iterate over all clear regions
- *  bitmap_for_each_set_region(map, rs, re, start, end)
- *  						Iterate over all set regions
  *  bitmap_shift_right(dst, src, n, nbits)      *dst = *src >> n
  *  bitmap_shift_left(dst, src, n, nbits)       *dst = *src << n
  *  bitmap_cut(dst, src, first, n, nbits)       Cut n bits from first, copy rest
@@ -467,14 +461,6 @@ static inline void bitmap_replace(unsigned long *dst,
 		__bitmap_replace(dst, old, new, mask, nbits);
 }
 
-static inline void bitmap_next_clear_region(unsigned long *bitmap,
-					    unsigned int *rs, unsigned int *re,
-					    unsigned int end)
-{
-	*rs = find_next_zero_bit(bitmap, end, *rs);
-	*re = find_next_bit(bitmap, end, *rs + 1);
-}
-
 static inline void bitmap_next_set_region(unsigned long *bitmap,
 					  unsigned int *rs, unsigned int *re,
 					  unsigned int end)
@@ -483,25 +469,6 @@ static inline void bitmap_next_set_region(unsigned long *bitmap,
 	*re = find_next_zero_bit(bitmap, end, *rs + 1);
 }
 
-/*
- * Bitmap region iterators.  Iterates over the bitmap between [@start, @end).
- * @rs and @re should be integer variables and will be set to start and end
- * index of the current clear or set region.
- */
-#define bitmap_for_each_clear_region(bitmap, rs, re, start, end)	     \
-	for ((rs) = (start),						     \
-	     bitmap_next_clear_region((bitmap), &(rs), &(re), (end));	     \
-	     (rs) < (re);						     \
-	     (rs) = (re) + 1,						     \
-	     bitmap_next_clear_region((bitmap), &(rs), &(re), (end)))
-
-#define bitmap_for_each_set_region(bitmap, rs, re, start, end)		     \
-	for ((rs) = (start),						     \
-	     bitmap_next_set_region((bitmap), &(rs), &(re), (end));	     \
-	     (rs) < (re);						     \
-	     (rs) = (re) + 1,						     \
-	     bitmap_next_set_region((bitmap), &(rs), &(re), (end)))
-
 /**
  * BITMAP_FROM_U64() - Represent u64 value in the format suitable for bitmap.
  * @n: u64 value
diff --git a/include/linux/find.h b/include/linux/find.h
index ae9ed52b52b8..5bb6db213bcb 100644
--- a/include/linux/find.h
+++ b/include/linux/find.h
@@ -301,6 +301,62 @@ unsigned long find_next_bit_le(const void *addr, unsigned
 	     (bit) < (size);					\
 	     (bit) = find_next_zero_bit((addr), (size), (bit) + 1))
 
+/**
+ * for_each_set_bitrange - iterate over all set bit ranges [b; e)
+ * @b: bit offset of start of current bitrange (first set bit)
+ * @e: bit offset of end of current bitrange (first unset bit)
+ * @addr: bitmap address to base the search on
+ * @size: bitmap size in number of bits
+ */
+#define for_each_set_bitrange(b, e, addr, size)			\
+	for ((b) = find_next_bit((addr), (size), 0),		\
+	     (e) = find_next_zero_bit((addr), (size), (b) + 1);	\
+	     (b) < (size);					\
+	     (b) = find_next_bit((addr), (size), (e) + 1),	\
+	     (e) = find_next_zero_bit((addr), (size), (b) + 1))
+
+/**
+ * for_each_set_bitrange_from - iterate over all set bit ranges [b; e)
+ * @b: bit offset of start of current bitrange (first set bit); must be initialized
+ * @e: bit offset of end of current bitrange (first unset bit)
+ * @addr: bitmap address to base the search on
+ * @size: bitmap size in number of bits
+ */
+#define for_each_set_bitrange_from(b, e, addr, size)		\
+	for ((b) = find_next_bit((addr), (size), (b)),		\
+	     (e) = find_next_zero_bit((addr), (size), (b) + 1);	\
+	     (b) < (size);					\
+	     (b) = find_next_bit((addr), (size), (e) + 1),	\
+	     (e) = find_next_zero_bit((addr), (size), (b) + 1))
+
+/**
+ * for_each_clear_bitrange - iterate over all unset bit ranges [b; e)
+ * @b: bit offset of start of current bitrange (first unset bit)
+ * @e: bit offset of end of current bitrange (first set bit)
+ * @addr: bitmap address to base the search on
+ * @size: bitmap size in number of bits
+ */
+#define for_each_clear_bitrange(b, e, addr, size)		\
+	for ((b) = find_next_zero_bit((addr), (size), 0),	\
+	     (e) = find_next_bit((addr), (size), (b) + 1);	\
+	     (b) < (size);					\
+	     (b) = find_next_zero_bit((addr), (size), (e) + 1),	\
+	     (e) = find_next_bit((addr), (size), (b) + 1))
+
+/**
+ * for_each_clear_bitrange_from - iterate over all unset bit ranges [b; e)
+ * @b: bit offset of start of current bitrange (first set bit); must be initialized
+ * @e: bit offset of end of current bitrange (first unset bit)
+ * @addr: bitmap address to base the search on
+ * @size: bitmap size in number of bits
+ */
+#define for_each_clear_bitrange_from(b, e, addr, size)		\
+	for ((b) = find_next_zero_bit((addr), (size), (b)),	\
+	     (e) = find_next_bit((addr), (size), (b) + 1);	\
+	     (b) < (size);					\
+	     (b) = find_next_zero_bit((addr), (size), (e) + 1),	\
+	     (e) = find_next_bit((addr), (size), (b) + 1))
+
 /**
  * for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits
  * @start: bit offset to start search and to store the current iteration offset
-- 
cgit v1.2.3


From 7372971c1be5b7d4fdd8ad237798bdc1d1d54162 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Tue, 11 Jan 2022 10:19:22 +0300
Subject: rtc: mc146818-lib: fix signedness bug in mc146818_get_time()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The mc146818_get_time() function returns zero on success or negative
a error code on failure.  It needs to be type int.

Fixes: d35786b3a28d ("rtc: mc146818-lib: change return values of mc146818_get_time()")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: Mateusz Jończyk <mat.jonczyk@o2.pl>
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Link: https://lore.kernel.org/r/20220111071922.GE11243@kili
---
 include/linux/mc146818rtc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mc146818rtc.h b/include/linux/mc146818rtc.h
index 67fb0a12becc..808bb4cee230 100644
--- a/include/linux/mc146818rtc.h
+++ b/include/linux/mc146818rtc.h
@@ -124,7 +124,7 @@ struct cmos_rtc_board_info {
 #endif /* ARCH_RTC_LOCATION */
 
 bool mc146818_does_rtc_work(void);
-unsigned int mc146818_get_time(struct rtc_time *time);
+int mc146818_get_time(struct rtc_time *time);
 int mc146818_set_time(struct rtc_time *time);
 
 bool mc146818_avoid_UIP(void (*callback)(unsigned char seconds, void *param),
-- 
cgit v1.2.3


From 3fe7fa5843d204e235d92902190fecb972a3f9cc Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 10 Dec 2021 15:09:21 -0500
Subject: mm: Add folio_put_refs()

This is like folio_put(), but puts N references at once instead of
just one.  It's like put_page_refs(), but does one atomic operation
instead of two, and is available to more than just gup.c.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
---
 include/linux/mm.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c768a7c81b0b..cb98f75b245e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1244,6 +1244,26 @@ static inline void folio_put(struct folio *folio)
 		__put_page(&folio->page);
 }
 
+/**
+ * folio_put_refs - Reduce the reference count on a folio.
+ * @folio: The folio.
+ * @refs: The amount to subtract from the folio's reference count.
+ *
+ * If the folio's reference count reaches zero, the memory will be
+ * released back to the page allocator and may be used by another
+ * allocation immediately.  Do not access the memory or the struct folio
+ * after calling folio_put_refs() unless you can be sure that these weren't
+ * the last references.
+ *
+ * Context: May be called in process or interrupt context, but not in NMI
+ * context.  May be called while holding a spinlock.
+ */
+static inline void folio_put_refs(struct folio *folio, int refs)
+{
+	if (folio_ref_sub_and_test(folio, refs))
+		__put_page(&folio->page);
+}
+
 static inline void put_page(struct page *page)
 {
 	struct folio *folio = page_folio(page);
-- 
cgit v1.2.3


From a6097180d884ddab769fb25588ea8598589c218c Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 17 Jan 2022 09:07:26 +1100
Subject: devtmpfs regression fix: reconfigure on each mount

Prior to Linux v5.4 devtmpfs used mount_single() which treats the given
mount options as "remount" options, so it updates the configuration of
the single super_block on each mount.

Since that was changed, the mount options used for devtmpfs are ignored.
This is a regression which affect systemd - which mounts devtmpfs with
"-o mode=755,size=4m,nr_inodes=1m".

This patch restores the "remount" effect by calling reconfigure_single()

Fixes: d401727ea0d7 ("devtmpfs: don't mix {ramfs,shmem}_fill_super() with mount_single()")
Acked-by: Christian Brauner <christian.brauner@ubuntu.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/fs_context.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index 6b54982fc5f3..13fa6f3df8e4 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -142,6 +142,8 @@ extern void put_fs_context(struct fs_context *fc);
 extern int vfs_parse_fs_param_source(struct fs_context *fc,
 				     struct fs_parameter *param);
 extern void fc_drop_locked(struct fs_context *fc);
+int reconfigure_single(struct super_block *s,
+		       int flags, void *data);
 
 /*
  * sget() wrappers to be called from the ->get_tree() op.
-- 
cgit v1.2.3


From 09f5e7dc7ad705289e1b1ec065439aa3c42951c4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 20 Dec 2021 13:19:52 +0100
Subject: perf: Fix perf_event_read_local() time

Time readers that cannot take locks (due to NMI etc..) currently make
use of perf_event::shadow_ctx_time, which, for that event gives:

  time' = now + (time - timestamp)

or, alternatively arranged:

  time' = time + (now - timestamp)

IOW, the progression of time since the last time the shadow_ctx_time
was updated.

There's problems with this:

 A) the shadow_ctx_time is per-event, even though the ctx_time it
    reflects is obviously per context. The direct concequence of this
    is that the context needs to iterate all events all the time to
    keep the shadow_ctx_time in sync.

 B) even with the prior point, the context itself might not be active
    meaning its time should not advance to begin with.

 C) shadow_ctx_time isn't consistently updated when ctx_time is

There are 3 users of this stuff, that suffer differently from this:

 - calc_timer_values()
   - perf_output_read()
   - perf_event_update_userpage()	/* A */

 - perf_event_read_local()		/* A,B */

In particular, perf_output_read() doesn't suffer at all, because it's
sample driven and hence only relevant when the event is actually
running.

This same was supposed to be true for perf_event_update_userpage(),
after all self-monitoring implies the context is active *HOWEVER*, as
per commit f79256532682 ("perf/core: fix userpage->time_enabled of
inactive events") this goes wrong when combined with counter
overcommit, in that case those events that do not get scheduled when
the context becomes active (task events typically) miss out on the
EVENT_TIME update and ENABLED time is inflated (for a little while)
with the time the context was inactive. Once the event gets rotated
in, this gets corrected, leading to a non-monotonic timeflow.

perf_event_read_local() made things even worse, it can request time at
any point, suffering all the problems perf_event_update_userpage()
does and more. Because while perf_event_update_userpage() is limited
by the context being active, perf_event_read_local() users have no
such constraint.

Therefore, completely overhaul things and do away with
perf_event::shadow_ctx_time. Instead have regular context time updates
keep track of this offset directly and provide perf_event_time_now()
to complement perf_event_time().

perf_event_time_now() will, in adition to being context wide, also
take into account if the context is active. For inactive context, it
will not advance time.

This latter property means the cgroup perf_cgroup_info context needs
to grow addition state to track this.

Additionally, since all this is strictly per-cpu, we can use barrier()
to order context activity vs context time.

Fixes: 7d9285e82db5 ("perf/bpf: Extend the perf_event_read_local() interface, a.k.a. "bpf: perf event change needed for subsequent bpf helpers"")
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Song Liu <song@kernel.org>
Tested-by: Namhyung Kim <namhyung@kernel.org>
Link: https://lkml.kernel.org/r/YcB06DasOBtU0b00@hirez.programming.kicks-ass.net
---
 include/linux/perf_event.h | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 117f230bcdfd..733649184b27 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -693,18 +693,6 @@ struct perf_event {
 	u64				total_time_running;
 	u64				tstamp;
 
-	/*
-	 * timestamp shadows the actual context timing but it can
-	 * be safely used in NMI interrupt context. It reflects the
-	 * context time as it was when the event was last scheduled in,
-	 * or when ctx_sched_in failed to schedule the event because we
-	 * run out of PMC.
-	 *
-	 * ctx_time already accounts for ctx->timestamp. Therefore to
-	 * compute ctx_time for a sample, simply add perf_clock().
-	 */
-	u64				shadow_ctx_time;
-
 	struct perf_event_attr		attr;
 	u16				header_size;
 	u16				id_header_size;
@@ -852,6 +840,7 @@ struct perf_event_context {
 	 */
 	u64				time;
 	u64				timestamp;
+	u64				timeoffset;
 
 	/*
 	 * These fields let us detect when two contexts have both
@@ -934,6 +923,8 @@ struct bpf_perf_event_data_kern {
 struct perf_cgroup_info {
 	u64				time;
 	u64				timestamp;
+	u64				timeoffset;
+	int				active;
 };
 
 struct perf_cgroup {
-- 
cgit v1.2.3


From a06247c6804f1a7c86a2e5398a4c1f1db1471848 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Tue, 11 Jan 2022 15:23:09 -0800
Subject: psi: Fix uaf issue when psi trigger is destroyed while being polled

With write operation on psi files replacing old trigger with a new one,
the lifetime of its waitqueue is totally arbitrary. Overwriting an
existing trigger causes its waitqueue to be freed and pending poll()
will stumble on trigger->event_wait which was destroyed.
Fix this by disallowing to redefine an existing psi trigger. If a write
operation is used on a file descriptor with an already existing psi
trigger, the operation will fail with EBUSY error.
Also bypass a check for psi_disabled in the psi_trigger_destroy as the
flag can be flipped after the trigger is created, leading to a memory
leak.

Fixes: 0e94682b73bf ("psi: introduce psi monitor")
Reported-by: syzbot+cdb5dd11c97cc532efad@syzkaller.appspotmail.com
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Analyzed-by: Eric Biggers <ebiggers@kernel.org>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Eric Biggers <ebiggers@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20220111232309.1786347-1-surenb@google.com
---
 include/linux/psi.h       | 2 +-
 include/linux/psi_types.h | 3 ---
 2 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/psi.h b/include/linux/psi.h
index a70ca833c6d7..f8ce53bfdb2a 100644
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -33,7 +33,7 @@ void cgroup_move_task(struct task_struct *p, struct css_set *to);
 
 struct psi_trigger *psi_trigger_create(struct psi_group *group,
 			char *buf, size_t nbytes, enum psi_res res);
-void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *t);
+void psi_trigger_destroy(struct psi_trigger *t);
 
 __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
 			poll_table *wait);
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index 516c0fe836fd..1a3cef26d129 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -141,9 +141,6 @@ struct psi_trigger {
 	 * events to one per window
 	 */
 	u64 last_event_time;
-
-	/* Refcounting to prevent premature destruction */
-	struct kref refcount;
 };
 
 struct psi_group {
-- 
cgit v1.2.3


From 0e3872499de1a1230cef5221607d71aa09264bd5 Mon Sep 17 00:00:00 2001
From: Hui Su <suhui_kernel@163.com>
Date: Fri, 7 Jan 2022 17:52:54 +0800
Subject: kernel/sched: Remove dl_boosted flag comment

since commit 2279f540ea7d ("sched/deadline: Fix priority
inheritance with multiple scheduling classes"), we should not
keep it here.

Signed-off-by: Hui Su <suhui_kernel@163.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Link: https://lore.kernel.org/r/20220107095254.GA49258@localhost.localdomain
---
 include/linux/sched.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7a1f16df66e3..7f8b4495e243 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -614,10 +614,6 @@ struct sched_dl_entity {
 	 * task has to wait for a replenishment to be performed at the
 	 * next firing of dl_timer.
 	 *
-	 * @dl_boosted tells if we are boosted due to DI. If so we are
-	 * outside bandwidth enforcement mechanism (but only until we
-	 * exit the critical section);
-	 *
 	 * @dl_yielded tells if task gave up the CPU before consuming
 	 * all its available runtime during the last job.
 	 *
-- 
cgit v1.2.3


From be80a1d3f9dbe5aee79a325964f7037fe2d92f30 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 10 Jan 2022 14:05:49 +0000
Subject: bpf: Generalize check_ctx_reg for reuse with other types

Generalize the check_ctx_reg() helper function into a more generic named one
so that it can be reused for other register types as well to check whether
their offset is non-zero. No functional change.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 143401d4c9d9..e9993172f892 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -519,8 +519,8 @@ bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off,
 void
 bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt);
 
-int check_ctx_reg(struct bpf_verifier_env *env,
-		  const struct bpf_reg_state *reg, int regno);
+int check_ptr_off_reg(struct bpf_verifier_env *env,
+		      const struct bpf_reg_state *reg, int regno);
 int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
 		   u32 regno, u32 mem_size);
 
-- 
cgit v1.2.3


From a672b2e36a648afb04ad3bda93b6bda947a479a5 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 13 Jan 2022 11:11:30 +0000
Subject: bpf: Fix ringbuf memory type confusion when passing to helpers

The bpf_ringbuf_submit() and bpf_ringbuf_discard() have ARG_PTR_TO_ALLOC_MEM
in their bpf_func_proto definition as their first argument, and thus both expect
the result from a prior bpf_ringbuf_reserve() call which has a return type of
RET_PTR_TO_ALLOC_MEM_OR_NULL.

While the non-NULL memory from bpf_ringbuf_reserve() can be passed to other
helpers, the two sinks (bpf_ringbuf_submit(), bpf_ringbuf_discard()) right now
only enforce a register type of PTR_TO_MEM.

This can lead to potential type confusion since it would allow other PTR_TO_MEM
memory to be passed into the two sinks which did not come from bpf_ringbuf_reserve().

Add a new MEM_ALLOC composable type attribute for PTR_TO_MEM, and enforce that:

 - bpf_ringbuf_reserve() returns NULL or PTR_TO_MEM | MEM_ALLOC
 - bpf_ringbuf_submit() and bpf_ringbuf_discard() only take PTR_TO_MEM | MEM_ALLOC
   but not plain PTR_TO_MEM arguments via ARG_PTR_TO_ALLOC_MEM
 - however, other helpers might treat PTR_TO_MEM | MEM_ALLOC as plain PTR_TO_MEM
   to populate the memory area when they use ARG_PTR_TO_{UNINIT_,}MEM in their
   func proto description

Fixes: 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it")
Reported-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 6e947cd91152..fa517ae604ad 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -316,7 +316,12 @@ enum bpf_type_flag {
 	 */
 	MEM_RDONLY		= BIT(1 + BPF_BASE_TYPE_BITS),
 
-	__BPF_TYPE_LAST_FLAG	= MEM_RDONLY,
+	/* MEM was "allocated" from a different helper, and cannot be mixed
+	 * with regular non-MEM_ALLOC'ed MEM types.
+	 */
+	MEM_ALLOC		= BIT(2 + BPF_BASE_TYPE_BITS),
+
+	__BPF_TYPE_LAST_FLAG	= MEM_ALLOC,
 };
 
 /* Max number of base types. */
@@ -400,7 +405,7 @@ enum bpf_return_type {
 	RET_PTR_TO_SOCKET_OR_NULL	= PTR_MAYBE_NULL | RET_PTR_TO_SOCKET,
 	RET_PTR_TO_TCP_SOCK_OR_NULL	= PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK,
 	RET_PTR_TO_SOCK_COMMON_OR_NULL	= PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON,
-	RET_PTR_TO_ALLOC_MEM_OR_NULL	= PTR_MAYBE_NULL | RET_PTR_TO_ALLOC_MEM,
+	RET_PTR_TO_ALLOC_MEM_OR_NULL	= PTR_MAYBE_NULL | MEM_ALLOC | RET_PTR_TO_ALLOC_MEM,
 	RET_PTR_TO_BTF_ID_OR_NULL	= PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID,
 
 	/* This must be the last entry. Its purpose is to ensure the enum is
-- 
cgit v1.2.3


From e6eec09b7bc7869a49ac0ff376415bad40030ade Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 8 Dec 2021 01:52:15 +0000
Subject: KVM: Drop unused kvm_vcpu.pre_pcpu field

Remove kvm_vcpu.pre_pcpu as it no longer has any users.  No functional
change intended.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
Message-Id: <20211208015236.1616697-6-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3c47b146851a..5c3c67b6318f 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -309,7 +309,6 @@ struct kvm_vcpu {
 	u64 requests;
 	unsigned long guest_debug;
 
-	int pre_pcpu;
 	struct list_head blocked_vcpu_list;
 
 	struct mutex mutex;
-- 
cgit v1.2.3


From 12a8eee5686ef3ea7d8db90cd664f11e4a39e349 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 8 Dec 2021 01:52:16 +0000
Subject: KVM: Move x86 VMX's posted interrupt list_head to vcpu_vmx

Move the seemingly generic block_vcpu_list from kvm_vcpu to vcpu_vmx, and
rename the list and all associated variables to clarify that it tracks
the set of vCPU that need to be poked on a posted interrupt to the wakeup
vector.  The list is not used to track _all_ vCPUs that are blocking, and
the term "blocked" can be misleading as it may refer to a blocking
condition in the host or the guest, where as the PI wakeup case is
specifically for the vCPUs that are actively blocking from within the
guest.

No functional change intended.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
Message-Id: <20211208015236.1616697-7-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 5c3c67b6318f..f079820f52b5 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -309,8 +309,6 @@ struct kvm_vcpu {
 	u64 requests;
 	unsigned long guest_debug;
 
-	struct list_head blocked_vcpu_list;
-
 	struct mutex mutex;
 	struct kvm_run *run;
 
-- 
cgit v1.2.3


From 1ca3fb3abd2b615c4b61728de545760a6e2c2d8b Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Wed, 19 Jan 2022 18:07:45 -0800
Subject: mm: percpu: add pcpu_fc_cpu_to_node_fn_t typedef

Add pcpu_fc_cpu_to_node_fn_t and pass it into pcpu_fc_alloc_fn_t, pcpu
first chunk allocation will call it to alloc memblock on the
corresponding node by it, this is prepare for the next patch.

Link: https://lkml.kernel.org/r/20211216112359.103822-3-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/percpu.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index ae4004e7957e..e4078bf45fd5 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -94,8 +94,9 @@ extern const char * const pcpu_fc_names[PCPU_FC_NR];
 
 extern enum pcpu_fc pcpu_chosen_fc;
 
-typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size,
-				     size_t align);
+typedef int (pcpu_fc_cpu_to_node_fn_t)(int cpu);
+typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size, size_t align,
+				     pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn);
 typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size);
 typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr);
 typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to);
@@ -111,12 +112,14 @@ extern void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 extern int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
 				size_t atom_size,
 				pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
+				pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn,
 				pcpu_fc_alloc_fn_t alloc_fn,
 				pcpu_fc_free_fn_t free_fn);
 #endif
 
 #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
 extern int __init pcpu_page_first_chunk(size_t reserved_size,
+				pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn,
 				pcpu_fc_alloc_fn_t alloc_fn,
 				pcpu_fc_free_fn_t free_fn,
 				pcpu_fc_populate_pte_fn_t populate_pte_fn);
-- 
cgit v1.2.3


From 23f917169ef157aa7a6bf80d8c4aad6f1282852c Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Wed, 19 Jan 2022 18:07:49 -0800
Subject: mm: percpu: add generic pcpu_fc_alloc/free funciton

With the previous patch, we could add a generic pcpu first chunk
allocate and free function to cleanup the duplicated definations on each
architecture.

Link: https://lkml.kernel.org/r/20211216112359.103822-4-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/percpu.h | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index e4078bf45fd5..d73c97ef4ff4 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -95,9 +95,6 @@ extern const char * const pcpu_fc_names[PCPU_FC_NR];
 extern enum pcpu_fc pcpu_chosen_fc;
 
 typedef int (pcpu_fc_cpu_to_node_fn_t)(int cpu);
-typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size, size_t align,
-				     pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn);
-typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size);
 typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr);
 typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to);
 
@@ -112,16 +109,12 @@ extern void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 extern int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
 				size_t atom_size,
 				pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
-				pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn,
-				pcpu_fc_alloc_fn_t alloc_fn,
-				pcpu_fc_free_fn_t free_fn);
+				pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn);
 #endif
 
 #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
 extern int __init pcpu_page_first_chunk(size_t reserved_size,
 				pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn,
-				pcpu_fc_alloc_fn_t alloc_fn,
-				pcpu_fc_free_fn_t free_fn,
 				pcpu_fc_populate_pte_fn_t populate_pte_fn);
 #endif
 
-- 
cgit v1.2.3


From 20c035764626c56c4f6514936b9ee4be0f4cd962 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Wed, 19 Jan 2022 18:07:53 -0800
Subject: mm: percpu: add generic pcpu_populate_pte() function

With NEED_PER_CPU_PAGE_FIRST_CHUNK enabled, we need a function to
populate pte, this patch adds a generic pcpu populate pte function,
pcpu_populate_pte(), which is marked __weak and used on most
architectures, but it is overridden on x86, which has its own
implementation.

Link: https://lkml.kernel.org/r/20211216112359.103822-5-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/percpu.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index d73c97ef4ff4..f1ec5ad1351c 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -95,7 +95,6 @@ extern const char * const pcpu_fc_names[PCPU_FC_NR];
 extern enum pcpu_fc pcpu_chosen_fc;
 
 typedef int (pcpu_fc_cpu_to_node_fn_t)(int cpu);
-typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr);
 typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to);
 
 extern struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
@@ -113,9 +112,9 @@ extern int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
 #endif
 
 #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
+void __init pcpu_populate_pte(unsigned long addr);
 extern int __init pcpu_page_first_chunk(size_t reserved_size,
-				pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn,
-				pcpu_fc_populate_pte_fn_t populate_pte_fn);
+				pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn);
 #endif
 
 extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align) __alloc_size(1);
-- 
cgit v1.2.3


From ae62fbe299629d3b2fa61d4cf5146258c4d99fdf Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Wed, 19 Jan 2022 18:08:00 -0800
Subject: proc: make the proc_create[_data]() stubs static inlines

Change the proc_create[_data]() stubs which are used when CONFIG_PROC_FS
is not set from #defines to a static inline stubs.

This should fix clang -Werror builds failing due to errors like this:

  drivers/platform/x86/thinkpad_acpi.c:918:30: error: unused variable
   'dispatch_proc_ops' [-Werror,-Wunused-const-variable]

Fixing this in include/linux/proc_fs.h should ensure that the same issue
is also fixed in any other drivers hitting the same -Werror issue.

[akpm@linux-foundation.org: fix CONFIG_PROC_FS=n]
[akpm@linux-foundation.org: fix arch/sparc/kernel/led.c]
[akpm@linux-foundation.org: fix build]

Link: https://lkml.kernel.org/r/20211116131112.508304-1-hdegoede@redhat.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Reported-by: kernel test robot <lkp@intel.com>
Acked-by: Christian Brauner <christian.brauner@ubuntu.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Hans de Goede <hdegoede@redhat.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/proc_fs.h | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 069c7fd95396..01b9268451a8 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -178,8 +178,16 @@ static inline struct proc_dir_entry *proc_mkdir_mode(const char *name,
 #define proc_create_seq(name, mode, parent, ops) ({NULL;})
 #define proc_create_single(name, mode, parent, show) ({NULL;})
 #define proc_create_single_data(name, mode, parent, show, data) ({NULL;})
-#define proc_create(name, mode, parent, proc_ops) ({NULL;})
-#define proc_create_data(name, mode, parent, proc_ops, data) ({NULL;})
+
+static inline struct proc_dir_entry *
+proc_create(const char *name, umode_t mode, struct proc_dir_entry *parent,
+	    const struct proc_ops *proc_ops)
+{ return NULL; }
+
+static inline struct proc_dir_entry *
+proc_create_data(const char *name, umode_t mode, struct proc_dir_entry *parent,
+		 const struct proc_ops *proc_ops, void *data)
+{ return NULL; }
 
 static inline void proc_set_size(struct proc_dir_entry *de, loff_t size) {}
 static inline void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid) {}
-- 
cgit v1.2.3


From 22c033989c3eb9731ad0c497dfab4231b8e367d6 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 19 Jan 2022 18:08:12 -0800
Subject: include/linux/unaligned: replace kernel.h with the necessary
 inclusions

When kernel.h is used in the headers it adds a lot into dependency hell,
especially when there are circular dependencies are involved.

Replace kernel.h inclusion with the list of what is really being used.

The rest of the changes are induced by the above and may not be split.

Link: https://lkml.kernel.org/r/20211209123823.20425-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Arend van Spriel <arend.vanspriel@broadcom.com>	[brcmfmac]
Acked-by: Kalle Valo <kvalo@kernel.org>
Cc: Arend van Spriel <aspriel@gmail.com>
Cc: Franky Lin <franky.lin@broadcom.com>
Cc: Hante Meuleman <hante.meuleman@broadcom.com>
Cc: Chi-hsien Lin <chi-hsien.lin@infineon.com>
Cc: Wright Feng <wright.feng@infineon.com>
Cc: Chung-hsien Hsu <chung-hsien.hsu@infineon.com>
Cc: Kalle Valo <kvalo@codeaurora.org>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/unaligned/packed_struct.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/unaligned/packed_struct.h b/include/linux/unaligned/packed_struct.h
index c0d817de4df2..f4c8eaf4d012 100644
--- a/include/linux/unaligned/packed_struct.h
+++ b/include/linux/unaligned/packed_struct.h
@@ -1,7 +1,7 @@
 #ifndef _LINUX_UNALIGNED_PACKED_STRUCT_H
 #define _LINUX_UNALIGNED_PACKED_STRUCT_H
 
-#include <linux/kernel.h>
+#include <linux/types.h>
 
 struct __una_u16 { u16 x; } __packed;
 struct __una_u32 { u32 x; } __packed;
-- 
cgit v1.2.3


From 40cbf09f060c8febef64541c463d4dd526abe445 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 19 Jan 2022 18:08:16 -0800
Subject: kernel.h: include a note to discourage people from including it in
 headers

Include a note at the top to discourage people from including it in
headers.

Link: https://lkml.kernel.org/r/20211209150803.4473-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kernel.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 77755ac3e189..36a612d82956 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -1,4 +1,13 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * NOTE:
+ *
+ * This header has combined a lot of unrelated to each other stuff.
+ * The process of splitting its content is in progress while keeping
+ * backward compatibility. That's why it's highly recommended NOT to
+ * include this header inside another header file, especially under
+ * generic or architectural include/ directory.
+ */
 #ifndef _LINUX_KERNEL_H
 #define _LINUX_KERNEL_H
 
-- 
cgit v1.2.3


From 95af469c4f609de011debc08e7a35b45201623a8 Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Wed, 19 Jan 2022 18:08:29 -0800
Subject: fs/binfmt_elf: replace open-coded string copy with get_task_comm

It is better to use get_task_comm() instead of the open coded string
copy as we do in other places.

struct elf_prpsinfo is used to dump the task information in userspace
coredump or kernel vmcore.  Below is the verification of vmcore,

  crash> ps
     PID    PPID  CPU       TASK        ST  %MEM     VSZ    RSS  COMM
        0      0   0  ffffffff9d21a940  RU   0.0       0      0  [swapper/0]
  >     0      0   1  ffffa09e40f85e80  RU   0.0       0      0  [swapper/1]
  >     0      0   2  ffffa09e40f81f80  RU   0.0       0      0  [swapper/2]
  >     0      0   3  ffffa09e40f83f00  RU   0.0       0      0  [swapper/3]
  >     0      0   4  ffffa09e40f80000  RU   0.0       0      0  [swapper/4]
  >     0      0   5  ffffa09e40f89f80  RU   0.0       0      0  [swapper/5]
        0      0   6  ffffa09e40f8bf00  RU   0.0       0      0  [swapper/6]
  >     0      0   7  ffffa09e40f88000  RU   0.0       0      0  [swapper/7]
  >     0      0   8  ffffa09e40f8de80  RU   0.0       0      0  [swapper/8]
  >     0      0   9  ffffa09e40f95e80  RU   0.0       0      0  [swapper/9]
  >     0      0  10  ffffa09e40f91f80  RU   0.0       0      0  [swapper/10]
  >     0      0  11  ffffa09e40f93f00  RU   0.0       0      0  [swapper/11]
  >     0      0  12  ffffa09e40f90000  RU   0.0       0      0  [swapper/12]
  >     0      0  13  ffffa09e40f9bf00  RU   0.0       0      0  [swapper/13]
  >     0      0  14  ffffa09e40f98000  RU   0.0       0      0  [swapper/14]
  >     0      0  15  ffffa09e40f9de80  RU   0.0       0      0  [swapper/15]

It works well as expected.

Some comments are added to explain why we use the hard-coded 16.

Link: https://lkml.kernel.org/r/20211120112738.45980-5-laoar.shao@gmail.com
Suggested-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Arnaldo Carvalho de Melo <arnaldo.melo@gmail.com>
Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Cc: Michal Miroslaw <mirq-linux@rere.qmqm.pl>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Kees Cook <keescook@chromium.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Dennis Dalessandro <dennis.dalessandro@cornelisnetworks.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/elfcore-compat.h | 5 +++++
 include/linux/elfcore.h        | 5 +++++
 2 files changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/elfcore-compat.h b/include/linux/elfcore-compat.h
index e272c3d452ce..54feb64e9b5d 100644
--- a/include/linux/elfcore-compat.h
+++ b/include/linux/elfcore-compat.h
@@ -43,6 +43,11 @@ struct compat_elf_prpsinfo
 	__compat_uid_t			pr_uid;
 	__compat_gid_t			pr_gid;
 	compat_pid_t			pr_pid, pr_ppid, pr_pgrp, pr_sid;
+	/*
+	 * The hard-coded 16 is derived from TASK_COMM_LEN, but it can't be
+	 * changed as it is exposed to userspace. We'd better make it hard-coded
+	 * here.
+	 */
 	char				pr_fname[16];
 	char				pr_psargs[ELF_PRARGSZ];
 };
diff --git a/include/linux/elfcore.h b/include/linux/elfcore.h
index 957ebec35aad..746e081879a5 100644
--- a/include/linux/elfcore.h
+++ b/include/linux/elfcore.h
@@ -65,6 +65,11 @@ struct elf_prpsinfo
 	__kernel_gid_t	pr_gid;
 	pid_t	pr_pid, pr_ppid, pr_pgrp, pr_sid;
 	/* Lots missing */
+	/*
+	 * The hard-coded 16 is derived from TASK_COMM_LEN, but it can't be
+	 * changed as it is exposed to userspace. We'd better make it hard-coded
+	 * here.
+	 */
 	char	pr_fname[16];	/* filename of executable */
 	char	pr_psargs[ELF_PRARGSZ];	/* initial part of arg list */
 };
-- 
cgit v1.2.3


From 3087c61ed2c48548b74dd343a5209b87082c682d Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Wed, 19 Jan 2022 18:08:40 -0800
Subject: tools/testing/selftests/bpf: replace open-coded 16 with TASK_COMM_LEN

As the sched:sched_switch tracepoint args are derived from the kernel,
we'd better make it same with the kernel.  So the macro TASK_COMM_LEN is
converted to type enum, then all the BPF programs can get it through
BTF.

The BPF program which wants to use TASK_COMM_LEN should include the
header vmlinux.h.  Regarding the test_stacktrace_map and
test_tracepoint, as the type defined in linux/bpf.h are also defined in
vmlinux.h, so we don't need to include linux/bpf.h again.

Link: https://lkml.kernel.org/r/20211120112738.45980-8-laoar.shao@gmail.com
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Arnaldo Carvalho de Melo <arnaldo.melo@gmail.com>
Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Cc: Michal Miroslaw <mirq-linux@rere.qmqm.pl>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Kees Cook <keescook@chromium.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Cc: Dennis Dalessandro <dennis.dalessandro@cornelisnetworks.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 78c351e35fec..cecd4806edc6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -274,8 +274,13 @@ struct task_group;
 
 #define get_current_state()	READ_ONCE(current->__state)
 
-/* Task command name length: */
-#define TASK_COMM_LEN			16
+/*
+ * Define the task command name length as enum, then it can be visible to
+ * BPF programs.
+ */
+enum {
+	TASK_COMM_LEN = 16,
+};
 
 extern void scheduler_tick(void);
 
-- 
cgit v1.2.3


From d6986ce24fc00b0638bd29efe8fb7ba7619ed2aa Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Wed, 19 Jan 2022 18:08:43 -0800
Subject: kthread: dynamically allocate memory to store kthread's full name

When I was implementing a new per-cpu kthread cfs_migration, I found the
comm of it "cfs_migration/%u" is truncated due to the limitation of
TASK_COMM_LEN.  For example, the comm of the percpu thread on CPU10~19
all have the same name "cfs_migration/1", which will confuse the user.
This issue is not critical, because we can get the corresponding CPU
from the task's Cpus_allowed.  But for kthreads corresponding to other
hardware devices, it is not easy to get the detailed device info from
task comm, for example,

    jbd2/nvme0n1p2-
    xfs-reclaim/sdf

Currently there are so many truncated kthreads:

    rcu_tasks_kthre
    rcu_tasks_rude_
    rcu_tasks_trace
    poll_mpt3sas0_s
    ext4-rsv-conver
    xfs-reclaim/sd{a, b, c, ...}
    xfs-blockgc/sd{a, b, c, ...}
    xfs-inodegc/sd{a, b, c, ...}
    audit_send_repl
    ecryptfs-kthrea
    vfio-irqfd-clea
    jbd2/nvme0n1p2-
    ...

We can shorten these names to work around this problem, but it may be
not applied to all of the truncated kthreads.  Take 'jbd2/nvme0n1p2-'
for example, it is a nice name, and it is not a good idea to shorten it.

One possible way to fix this issue is extending the task comm size, but
as task->comm is used in lots of places, that may cause some potential
buffer overflows.  Another more conservative approach is introducing a
new pointer to store kthread's full name if it is truncated, which won't
introduce too much overhead as it is in the non-critical path.  Finally
we make a dicision to use the second approach.  See also the discussions
in this thread:
https://lore.kernel.org/lkml/20211101060419.4682-1-laoar.shao@gmail.com/

After this change, the full name of these truncated kthreads will be
displayed via /proc/[pid]/comm:

    rcu_tasks_kthread
    rcu_tasks_rude_kthread
    rcu_tasks_trace_kthread
    poll_mpt3sas0_statu
    ext4-rsv-conversion
    xfs-reclaim/sdf1
    xfs-blockgc/sdf1
    xfs-inodegc/sdf1
    audit_send_reply
    ecryptfs-kthread
    vfio-irqfd-cleanup
    jbd2/nvme0n1p2-8

Link: https://lkml.kernel.org/r/20211120112850.46047-1-laoar.shao@gmail.com
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Suggested-by: Petr Mladek <pmladek@suse.com>
Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Arnaldo Carvalho de Melo <arnaldo.melo@gmail.com>
Cc: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Cc: Michal Miroslaw <mirq-linux@rere.qmqm.pl>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kthread.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 346b0f269161..2a5c04494663 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -33,6 +33,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
 					  unsigned int cpu,
 					  const char *namefmt);
 
+void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk);
 void set_kthread_struct(struct task_struct *p);
 
 void kthread_set_per_cpu(struct task_struct *k, int cpu);
-- 
cgit v1.2.3


From 0425473037db40d9e322631f2d4dc6ef51f97e88 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 19 Jan 2022 18:08:56 -0800
Subject: list: introduce list_is_head() helper and re-use it in list.h

Introduce list_is_head() in the similar (*) way as it's done for
list_entry_is_head().  Make use of it in the list.h.

*) it's done as inliner and not a macro to be aligned with other
   list_is_*() APIs; while at it, make all three to have the same
   style.

Link: https://lkml.kernel.org/r/20211201141824.81400-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/list.h | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/list.h b/include/linux/list.h
index 6636fc07f918..dd6c2041d09c 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -258,8 +258,7 @@ static inline void list_bulk_move_tail(struct list_head *head,
  * @list: the entry to test
  * @head: the head of the list
  */
-static inline int list_is_first(const struct list_head *list,
-					const struct list_head *head)
+static inline int list_is_first(const struct list_head *list, const struct list_head *head)
 {
 	return list->prev == head;
 }
@@ -269,12 +268,21 @@ static inline int list_is_first(const struct list_head *list,
  * @list: the entry to test
  * @head: the head of the list
  */
-static inline int list_is_last(const struct list_head *list,
-				const struct list_head *head)
+static inline int list_is_last(const struct list_head *list, const struct list_head *head)
 {
 	return list->next == head;
 }
 
+/**
+ * list_is_head - tests whether @list is the list @head
+ * @list: the entry to test
+ * @head: the head of the list
+ */
+static inline int list_is_head(const struct list_head *list, const struct list_head *head)
+{
+	return list == head;
+}
+
 /**
  * list_empty - tests whether a list is empty
  * @head: the list to test.
@@ -318,7 +326,7 @@ static inline void list_del_init_careful(struct list_head *entry)
 static inline int list_empty_careful(const struct list_head *head)
 {
 	struct list_head *next = smp_load_acquire(&head->next);
-	return (next == head) && (next == head->prev);
+	return list_is_head(next, head) && (next == head->prev);
 }
 
 /**
@@ -393,10 +401,9 @@ static inline void list_cut_position(struct list_head *list,
 {
 	if (list_empty(head))
 		return;
-	if (list_is_singular(head) &&
-		(head->next != entry && head != entry))
+	if (list_is_singular(head) && !list_is_head(entry, head) && (entry != head->next))
 		return;
-	if (entry == head)
+	if (list_is_head(entry, head))
 		INIT_LIST_HEAD(list);
 	else
 		__list_cut_position(list, head, entry);
@@ -570,7 +577,7 @@ static inline void list_splice_tail_init(struct list_head *list,
  * @head:	the head for your list.
  */
 #define list_for_each(pos, head) \
-	for (pos = (head)->next; pos != (head); pos = pos->next)
+	for (pos = (head)->next; !list_is_head(pos, (head)); pos = pos->next)
 
 /**
  * list_for_each_continue - continue iteration over a list
@@ -580,7 +587,7 @@ static inline void list_splice_tail_init(struct list_head *list,
  * Continue to iterate over a list, continuing after the current position.
  */
 #define list_for_each_continue(pos, head) \
-	for (pos = pos->next; pos != (head); pos = pos->next)
+	for (pos = pos->next; !list_is_head(pos, (head)); pos = pos->next)
 
 /**
  * list_for_each_prev	-	iterate over a list backwards
@@ -588,7 +595,7 @@ static inline void list_splice_tail_init(struct list_head *list,
  * @head:	the head for your list.
  */
 #define list_for_each_prev(pos, head) \
-	for (pos = (head)->prev; pos != (head); pos = pos->prev)
+	for (pos = (head)->prev; !list_is_head(pos, (head)); pos = pos->prev)
 
 /**
  * list_for_each_safe - iterate over a list safe against removal of list entry
@@ -597,8 +604,9 @@ static inline void list_splice_tail_init(struct list_head *list,
  * @head:	the head for your list.
  */
 #define list_for_each_safe(pos, n, head) \
-	for (pos = (head)->next, n = pos->next; pos != (head); \
-		pos = n, n = pos->next)
+	for (pos = (head)->next, n = pos->next; \
+	     !list_is_head(pos, (head)); \
+	     pos = n, n = pos->next)
 
 /**
  * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry
@@ -608,7 +616,7 @@ static inline void list_splice_tail_init(struct list_head *list,
  */
 #define list_for_each_prev_safe(pos, n, head) \
 	for (pos = (head)->prev, n = pos->prev; \
-	     pos != (head); \
+	     !list_is_head(pos, (head)); \
 	     pos = n, n = pos->prev)
 
 /**
-- 
cgit v1.2.3


From fd0a1462405b087377e59b84e119fe7e2d08499a Mon Sep 17 00:00:00 2001
From: Isabella Basso <isabbasso@riseup.net>
Date: Wed, 19 Jan 2022 18:09:02 -0800
Subject: hash.h: remove unused define directive
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "test_hash.c: refactor into KUnit", v3.

We refactored the lib/test_hash.c file into KUnit as part of the student
group LKCAMP [1] introductory hackathon for kernel development.

This test was pointed to our group by Daniel Latypov [2], so its full
conversion into a pure KUnit test was our goal in this patch series, but
we ran into many problems relating to it not being split as unit tests,
which complicated matters a bit, as the reasoning behind the original
tests is quite cryptic for those unfamiliar with hash implementations.

Some interesting developments we'd like to highlight are:

 - In patch 1/5 we noticed that there was an unused define directive
   that could be removed.

 - In patch 4/5 we noticed how stringhash and hash tests are all under
   the lib/test_hash.c file, which might cause some confusion, and we
   also broke those kernel config entries up.

Overall KUnit developments have been made in the other patches in this
series:

In patches 2/5, 3/5 and 5/5 we refactored the lib/test_hash.c file so as
to make it more compatible with the KUnit style, whilst preserving the
original idea of the maintainer who designed it (i.e.  George Spelvin),
which might be undesirable for unit tests, but we assume it is enough
for a first patch.

This patch (of 5):

Currently, there exist hash_32() and __hash_32() functions, which were
introduced in a patch [1] targeting architecture specific optimizations.
These functions can be overridden on a per-architecture basis to achieve
such optimizations.  They must set their corresponding define directive
(HAVE_ARCH_HASH_32 and HAVE_ARCH__HASH_32, respectively) so that header
files can deal with these overrides properly.

As the supported 32-bit architectures that have their own hash function
implementation (i.e.  m68k, Microblaze, H8/300, pa-risc) have only been
making use of the (more general) __hash_32() function (which only lacks
a right shift operation when compared to the hash_32() function), remove
the define directive corresponding to the arch-specific hash_32()
implementation.

[1] https://lore.kernel.org/lkml/20160525073311.5600.qmail@ns.sciencehorizons.net/

[akpm@linux-foundation.org: hash_32_generic() becomes hash_32()]

Link: https://lkml.kernel.org/r/20211208183711.390454-1-isabbasso@riseup.net
Link: https://lkml.kernel.org/r/20211208183711.390454-2-isabbasso@riseup.net
Reviewed-by: David Gow <davidgow@google.com>
Tested-by: David Gow <davidgow@google.com>
Co-developed-by: Augusto Durães Camargo <augusto.duraes33@gmail.com>
Signed-off-by: Augusto Durães Camargo <augusto.duraes33@gmail.com>
Co-developed-by: Enzo Ferreira <ferreiraenzoa@gmail.com>
Signed-off-by: Enzo Ferreira <ferreiraenzoa@gmail.com>
Signed-off-by: Isabella Basso <isabbasso@riseup.net>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Brendan Higgins <brendanhiggins@google.com>
Cc: Daniel Latypov <dlatypov@google.com>
Cc: Shuah Khan <skhan@linuxfoundation.org>
Cc: Rodrigo Siqueira <rodrigosiqueiramelo@gmail.com>
Cc: kernel test robot <lkp@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hash.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hash.h b/include/linux/hash.h
index ad6fa21d977b..38edaa08f862 100644
--- a/include/linux/hash.h
+++ b/include/linux/hash.h
@@ -62,10 +62,7 @@ static inline u32 __hash_32_generic(u32 val)
 	return val * GOLDEN_RATIO_32;
 }
 
-#ifndef HAVE_ARCH_HASH_32
-#define hash_32 hash_32_generic
-#endif
-static inline u32 hash_32_generic(u32 val, unsigned int bits)
+static inline u32 hash_32(u32 val, unsigned int bits)
 {
 	/* High bits are more random, so use them. */
 	return __hash_32(val) >> (32 - bits);
-- 
cgit v1.2.3


From a3d5dc908a5f572ce3e31fe83fd2459a1c3c5422 Mon Sep 17 00:00:00 2001
From: Yang Yang <yang.yang29@zte.com.cn>
Date: Wed, 19 Jan 2022 18:10:02 -0800
Subject: delayacct: support swapin delay accounting for swapping without blkio

Currently delayacct accounts swapin delay only for swapping that cause
blkio.  If we use zram for swapping, tools/accounting/getdelays can't
get any SWAP delay.

It's useful to get zram swapin delay information, for example to adjust
compress algorithm or /proc/sys/vm/swappiness.

Reference to PSI, it accounts any kind of swapping by doing its work in
swap_readpage(), no matter whether swapping causes blkio.  Let delayacct
do the similar work.

Link: https://lkml.kernel.org/r/20211112083813.8559-1-yang.yang29@zte.com.cn
Signed-off-by: Yang Yang <yang.yang29@zte.com.cn>
Reported-by: Zeal Robot <zealci@zte.com.cn>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/delayacct.h | 44 ++++++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index af7e6eb50283..b96d68f310a2 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -9,14 +9,6 @@
 
 #include <uapi/linux/taskstats.h>
 
-/*
- * Per-task flags relevant to delay accounting
- * maintained privately to avoid exhausting similar flags in sched.h:PF_*
- * Used to set current->delays->flags
- */
-#define DELAYACCT_PF_SWAPIN	0x00000001	/* I am doing a swapin */
-#define DELAYACCT_PF_BLKIO	0x00000002	/* I am waiting on IO */
-
 #ifdef CONFIG_TASK_DELAY_ACCT
 struct task_delay_info {
 	raw_spinlock_t	lock;
@@ -37,13 +29,13 @@ struct task_delay_info {
 	 * associated with the operation is added to XXX_delay.
 	 * XXX_delay contains the accumulated delay time in nanoseconds.
 	 */
-	u64 blkio_start;	/* Shared by blkio, swapin */
+	u64 blkio_start;
 	u64 blkio_delay;	/* wait for sync block io completion */
-	u64 swapin_delay;	/* wait for swapin block io completion */
+	u64 swapin_start;
+	u64 swapin_delay;	/* wait for swapin */
 	u32 blkio_count;	/* total count of the number of sync block */
 				/* io operations performed */
-	u32 swapin_count;	/* total count of the number of swapin block */
-				/* io operations performed */
+	u32 swapin_count;	/* total count of swapin */
 
 	u64 freepages_start;
 	u64 freepages_delay;	/* wait for memory reclaim */
@@ -79,14 +71,8 @@ extern void __delayacct_freepages_start(void);
 extern void __delayacct_freepages_end(void);
 extern void __delayacct_thrashing_start(void);
 extern void __delayacct_thrashing_end(void);
-
-static inline int delayacct_is_task_waiting_on_io(struct task_struct *p)
-{
-	if (p->delays)
-		return (p->delays->flags & DELAYACCT_PF_BLKIO);
-	else
-		return 0;
-}
+extern void __delayacct_swapin_start(void);
+extern void __delayacct_swapin_end(void);
 
 static inline void delayacct_set_flag(struct task_struct *p, int flag)
 {
@@ -123,7 +109,6 @@ static inline void delayacct_blkio_start(void)
 	if (!static_branch_unlikely(&delayacct_key))
 		return;
 
-	delayacct_set_flag(current, DELAYACCT_PF_BLKIO);
 	if (current->delays)
 		__delayacct_blkio_start();
 }
@@ -135,7 +120,6 @@ static inline void delayacct_blkio_end(struct task_struct *p)
 
 	if (p->delays)
 		__delayacct_blkio_end(p);
-	delayacct_clear_flag(p, DELAYACCT_PF_BLKIO);
 }
 
 static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk)
@@ -169,6 +153,18 @@ static inline void delayacct_thrashing_end(void)
 		__delayacct_thrashing_end();
 }
 
+static inline void delayacct_swapin_start(void)
+{
+	if (current->delays)
+		__delayacct_swapin_start();
+}
+
+static inline void delayacct_swapin_end(void)
+{
+	if (current->delays)
+		__delayacct_swapin_end();
+}
+
 #else
 static inline void delayacct_set_flag(struct task_struct *p, int flag)
 {}
@@ -199,6 +195,10 @@ static inline void delayacct_thrashing_start(void)
 {}
 static inline void delayacct_thrashing_end(void)
 {}
+static inline void delayacct_swapin_start(void)
+{}
+static inline void delayacct_swapin_end(void)
+{}
 
 #endif /* CONFIG_TASK_DELAY_ACCT */
 
-- 
cgit v1.2.3


From 82065b7266899fbdce4c7394d7dd02688161f0cf Mon Sep 17 00:00:00 2001
From: Yang Yang <yang.yang29@zte.com.cn>
Date: Wed, 19 Jan 2022 18:10:06 -0800
Subject: delayacct: fix incomplete disable operation when switch enable to
 disable

When a task is created after delayacct is enabled, kernel will do all
the delay accountings for that task.  The problems is if user disables
delayacct by set /proc/sys/kernel/task_delayacct to zero, only blkio
delay accounting is disabled.

Now disable all the kinds of delay accountings when
/proc/sys/kernel/task_delayacct sets to zero.

Link: https://lkml.kernel.org/r/20211123140342.32962-1-ran.xiaokai@zte.com.cn
Signed-off-by: Yang Yang <yang.yang29@zte.com.cn>
Reported-by: Zeal Robot <zealci@zte.com.cn>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/delayacct.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index b96d68f310a2..c675cfb6437e 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -131,36 +131,54 @@ static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk)
 
 static inline void delayacct_freepages_start(void)
 {
+	if (!static_branch_unlikely(&delayacct_key))
+		return;
+
 	if (current->delays)
 		__delayacct_freepages_start();
 }
 
 static inline void delayacct_freepages_end(void)
 {
+	if (!static_branch_unlikely(&delayacct_key))
+		return;
+
 	if (current->delays)
 		__delayacct_freepages_end();
 }
 
 static inline void delayacct_thrashing_start(void)
 {
+	if (!static_branch_unlikely(&delayacct_key))
+		return;
+
 	if (current->delays)
 		__delayacct_thrashing_start();
 }
 
 static inline void delayacct_thrashing_end(void)
 {
+	if (!static_branch_unlikely(&delayacct_key))
+		return;
+
 	if (current->delays)
 		__delayacct_thrashing_end();
 }
 
 static inline void delayacct_swapin_start(void)
 {
+	if (!static_branch_unlikely(&delayacct_key))
+		return;
+
 	if (current->delays)
 		__delayacct_swapin_start();
 }
 
 static inline void delayacct_swapin_end(void)
 {
+	if (!static_branch_unlikely(&delayacct_key))
+		return;
+
 	if (current->delays)
 		__delayacct_swapin_end();
 }
-- 
cgit v1.2.3


From 1193829da1a6728249cd02577a020bd64fd9c160 Mon Sep 17 00:00:00 2001
From: Yang Yang <yang.yang29@zte.com.cn>
Date: Wed, 19 Jan 2022 18:10:09 -0800
Subject: delayacct: cleanup flags in struct task_delay_info and functions use
 it

Flags in struct task_delay_info is used to distinguish the difference
between swapin and blkio delay acountings.  But after patch "delayacct:
support swapin delay accounting for swapping without blkio", there is no
need to do that since swapin and blkio delay accounting use their own
functions.

Link: https://lkml.kernel.org/r/20211124065958.36703-1-yang.yang29@zte.com.cn
Signed-off-by: Yang Yang <yang.yang29@zte.com.cn>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Zeal Robot <zealci@zte.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/delayacct.h | 17 -----------------
 1 file changed, 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index c675cfb6437e..435c3654a0ff 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -12,7 +12,6 @@
 #ifdef CONFIG_TASK_DELAY_ACCT
 struct task_delay_info {
 	raw_spinlock_t	lock;
-	unsigned int	flags;	/* Private per-task flags */
 
 	/* For each stat XXX, add following, aligned appropriately
 	 *
@@ -74,18 +73,6 @@ extern void __delayacct_thrashing_end(void);
 extern void __delayacct_swapin_start(void);
 extern void __delayacct_swapin_end(void);
 
-static inline void delayacct_set_flag(struct task_struct *p, int flag)
-{
-	if (p->delays)
-		p->delays->flags |= flag;
-}
-
-static inline void delayacct_clear_flag(struct task_struct *p, int flag)
-{
-	if (p->delays)
-		p->delays->flags &= ~flag;
-}
-
 static inline void delayacct_tsk_init(struct task_struct *tsk)
 {
 	/* reinitialize in case parent's non-null pointer was dup'ed*/
@@ -184,10 +171,6 @@ static inline void delayacct_swapin_end(void)
 }
 
 #else
-static inline void delayacct_set_flag(struct task_struct *p, int flag)
-{}
-static inline void delayacct_clear_flag(struct task_struct *p, int flag)
-{}
 static inline void delayacct_init(void)
 {}
 static inline void delayacct_tsk_init(struct task_struct *tsk)
-- 
cgit v1.2.3


From 5bf18281534451bf1ad56a45a3085cd7ad46860d Mon Sep 17 00:00:00 2001
From: wangyong <wang.yong12@zte.com.cn>
Date: Wed, 19 Jan 2022 18:10:15 -0800
Subject: delayacct: track delays from memory compact

Delay accounting does not track the delay of memory compact.  When there
is not enough free memory, tasks can spend a amount of their time
waiting for compact.

To get the impact of tasks in direct memory compact, measure the delay
when allocating memory through memory compact.

Also update tools/accounting/getdelays.c:

    / # ./getdelays_next  -di -p 304
    print delayacct stats ON
    printing IO accounting
    PID     304

    CPU             count     real total  virtual total    delay total  delay average
                      277      780000000      849039485       18877296          0.068ms
    IO              count    delay total  delay average
                        0              0              0ms
    SWAP            count    delay total  delay average
                        0              0              0ms
    RECLAIM         count    delay total  delay average
                        5    11088812685           2217ms
    THRASHING       count    delay total  delay average
                        0              0              0ms
    COMPACT         count    delay total  delay average
                        3          72758              0ms
    watch: read=0, write=0, cancelled_write=0

Link: https://lkml.kernel.org/r/1638619795-71451-1-git-send-email-wang.yong12@zte.com.cn
Signed-off-by: wangyong <wang.yong12@zte.com.cn>
Reviewed-by: Jiang Xuexin <jiang.xuexin@zte.com.cn>
Reviewed-by: Zhang Wenya <zhang.wenya1@zte.com.cn>
Reviewed-by: Yang Yang <yang.yang29@zte.com.cn>
Reviewed-by: Balbir Singh <bsingharora@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/delayacct.h | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 435c3654a0ff..3e03d010bd2e 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -42,8 +42,12 @@ struct task_delay_info {
 	u64 thrashing_start;
 	u64 thrashing_delay;	/* wait for thrashing page */
 
+	u64 compact_start;
+	u64 compact_delay;	/* wait for memory compact */
+
 	u32 freepages_count;	/* total count of memory reclaim */
 	u32 thrashing_count;	/* total count of thrash waits */
+	u32 compact_count;	/* total count of memory compact */
 };
 #endif
 
@@ -72,6 +76,8 @@ extern void __delayacct_thrashing_start(void);
 extern void __delayacct_thrashing_end(void);
 extern void __delayacct_swapin_start(void);
 extern void __delayacct_swapin_end(void);
+extern void __delayacct_compact_start(void);
+extern void __delayacct_compact_end(void);
 
 static inline void delayacct_tsk_init(struct task_struct *tsk)
 {
@@ -170,6 +176,24 @@ static inline void delayacct_swapin_end(void)
 		__delayacct_swapin_end();
 }
 
+static inline void delayacct_compact_start(void)
+{
+	if (!static_branch_unlikely(&delayacct_key))
+		return;
+
+	if (current->delays)
+		__delayacct_compact_start();
+}
+
+static inline void delayacct_compact_end(void)
+{
+	if (!static_branch_unlikely(&delayacct_key))
+		return;
+
+	if (current->delays)
+		__delayacct_compact_end();
+}
+
 #else
 static inline void delayacct_init(void)
 {}
@@ -200,6 +224,10 @@ static inline void delayacct_swapin_start(void)
 {}
 static inline void delayacct_swapin_end(void)
 {}
+static inline void delayacct_compact_start(void)
+{}
+static inline void delayacct_compact_end(void)
+{}
 
 #endif /* CONFIG_TASK_DELAY_ACCT */
 
-- 
cgit v1.2.3


From 47934e06b65637c88a762d9c98329ae6e3238888 Mon Sep 17 00:00:00 2001
From: Congyu Liu <liu3101@purdue.edu>
Date: Tue, 18 Jan 2022 14:20:13 -0500
Subject: net: fix information leakage in /proc/net/ptype

In one net namespace, after creating a packet socket without binding
it to a device, users in other net namespaces can observe the new
`packet_type` added by this packet socket by reading `/proc/net/ptype`
file. This is minor information leakage as packet socket is
namespace aware.

Add a net pointer in `packet_type` to keep the net namespace of
of corresponding packet socket. In `ptype_seq_show`, this net pointer
must be checked when it is not NULL.

Fixes: 2feb27dbe00c ("[NETNS]: Minor information leak via /proc/net/ptype file.")
Signed-off-by: Congyu Liu <liu3101@purdue.edu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3213c7227b59..e490b84732d1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2548,6 +2548,7 @@ struct packet_type {
 					      struct net_device *);
 	bool			(*id_match)(struct packet_type *ptype,
 					    struct sock *sk);
+	struct net		*af_packet_net;
 	void			*af_packet_priv;
 	struct list_head	list;
 };
-- 
cgit v1.2.3


From e2f08207c558bc0bc8abaa557cdb29bad776ac7b Mon Sep 17 00:00:00 2001
From: Moshe Tal <moshet@nvidia.com>
Date: Thu, 20 Jan 2022 11:55:50 +0200
Subject: ethtool: Fix link extended state for big endian

The link extended sub-states are assigned as enum that is an integer
size but read from a union as u8, this is working for small values on
little endian systems but for big endian this always give 0. Fix the
variable in the union to match the enum size.

Fixes: ecc31c60240b ("ethtool: Add link extended state")
Signed-off-by: Moshe Tal <moshet@nvidia.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Tested-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Gal Pressman <gal@nvidia.com>
Reviewed-by: Amit Cohen <amcohen@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index a26f37a27167..11efc45de66a 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -111,7 +111,7 @@ struct ethtool_link_ext_state_info {
 		enum ethtool_link_ext_substate_bad_signal_integrity bad_signal_integrity;
 		enum ethtool_link_ext_substate_cable_issue cable_issue;
 		enum ethtool_link_ext_substate_module module;
-		u8 __link_ext_substate;
+		u32 __link_ext_substate;
 	};
 };
 
-- 
cgit v1.2.3


From 66a8f7f04979f4ad739085f01d99c8caf620b4f5 Mon Sep 17 00:00:00 2001
From: Michael Walle <michael@walle.cc>
Date: Tue, 18 Jan 2022 18:35:02 +0100
Subject: of: base: make small of_parse_phandle() variants static inline

Make all the smaller variants of the of_parse_phandle() static inline.
This also let us remove the empty function stubs if CONFIG_OF is not
defined.

Suggested-by: Rob Herring <robh@kernel.org>
Signed-off-by: Michael Walle <michael@walle.cc>
[robh: move index < 0 check into __of_parse_phandle_with_args]
Signed-off-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20220118173504.2867523-2-michael@walle.cc
---
 include/linux/of.h | 148 +++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 120 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/of.h b/include/linux/of.h
index ff143a027abc..16d76c92fbe0 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -364,18 +364,12 @@ extern const struct of_device_id *of_match_node(
 	const struct of_device_id *matches, const struct device_node *node);
 extern int of_modalias_node(struct device_node *node, char *modalias, int len);
 extern void of_print_phandle_args(const char *msg, const struct of_phandle_args *args);
-extern struct device_node *of_parse_phandle(const struct device_node *np,
-					    const char *phandle_name,
-					    int index);
-extern int of_parse_phandle_with_args(const struct device_node *np,
-	const char *list_name, const char *cells_name, int index,
-	struct of_phandle_args *out_args);
+extern int __of_parse_phandle_with_args(const struct device_node *np,
+	const char *list_name, const char *cells_name, int cell_count,
+	int index, struct of_phandle_args *out_args);
 extern int of_parse_phandle_with_args_map(const struct device_node *np,
 	const char *list_name, const char *stem_name, int index,
 	struct of_phandle_args *out_args);
-extern int of_parse_phandle_with_fixed_args(const struct device_node *np,
-	const char *list_name, int cells_count, int index,
-	struct of_phandle_args *out_args);
 extern int of_count_phandle_with_args(const struct device_node *np,
 	const char *list_name, const char *cells_name);
 
@@ -865,18 +859,12 @@ static inline int of_property_read_string_helper(const struct device_node *np,
 	return -ENOSYS;
 }
 
-static inline struct device_node *of_parse_phandle(const struct device_node *np,
-						   const char *phandle_name,
-						   int index)
-{
-	return NULL;
-}
-
-static inline int of_parse_phandle_with_args(const struct device_node *np,
-					     const char *list_name,
-					     const char *cells_name,
-					     int index,
-					     struct of_phandle_args *out_args)
+static inline int __of_parse_phandle_with_args(const struct device_node *np,
+					       const char *list_name,
+					       const char *cells_name,
+					       int cell_count,
+					       int index,
+					       struct of_phandle_args *out_args)
 {
 	return -ENOSYS;
 }
@@ -890,13 +878,6 @@ static inline int of_parse_phandle_with_args_map(const struct device_node *np,
 	return -ENOSYS;
 }
 
-static inline int of_parse_phandle_with_fixed_args(const struct device_node *np,
-	const char *list_name, int cells_count, int index,
-	struct of_phandle_args *out_args)
-{
-	return -ENOSYS;
-}
-
 static inline int of_count_phandle_with_args(const struct device_node *np,
 					     const char *list_name,
 					     const char *cells_name)
@@ -1077,6 +1058,117 @@ static inline bool of_node_is_type(const struct device_node *np, const char *typ
 	return np && match && type && !strcmp(match, type);
 }
 
+/**
+ * of_parse_phandle - Resolve a phandle property to a device_node pointer
+ * @np: Pointer to device node holding phandle property
+ * @phandle_name: Name of property holding a phandle value
+ * @index: For properties holding a table of phandles, this is the index into
+ *         the table
+ *
+ * Return: The device_node pointer with refcount incremented.  Use
+ * of_node_put() on it when done.
+ */
+static inline struct device_node *of_parse_phandle(const struct device_node *np,
+						   const char *phandle_name,
+						   int index)
+{
+	struct of_phandle_args args;
+
+	if (__of_parse_phandle_with_args(np, phandle_name, NULL, 0,
+					 index, &args))
+		return NULL;
+
+	return args.np;
+}
+
+/**
+ * of_parse_phandle_with_args() - Find a node pointed by phandle in a list
+ * @np:		pointer to a device tree node containing a list
+ * @list_name:	property name that contains a list
+ * @cells_name:	property name that specifies phandles' arguments count
+ * @index:	index of a phandle to parse out
+ * @out_args:	optional pointer to output arguments structure (will be filled)
+ *
+ * This function is useful to parse lists of phandles and their arguments.
+ * Returns 0 on success and fills out_args, on error returns appropriate
+ * errno value.
+ *
+ * Caller is responsible to call of_node_put() on the returned out_args->np
+ * pointer.
+ *
+ * Example::
+ *
+ *  phandle1: node1 {
+ *	#list-cells = <2>;
+ *  };
+ *
+ *  phandle2: node2 {
+ *	#list-cells = <1>;
+ *  };
+ *
+ *  node3 {
+ *	list = <&phandle1 1 2 &phandle2 3>;
+ *  };
+ *
+ * To get a device_node of the ``node2`` node you may call this:
+ * of_parse_phandle_with_args(node3, "list", "#list-cells", 1, &args);
+ */
+static inline int of_parse_phandle_with_args(const struct device_node *np,
+					     const char *list_name,
+					     const char *cells_name,
+					     int index,
+					     struct of_phandle_args *out_args)
+{
+	int cell_count = -1;
+
+	/* If cells_name is NULL we assume a cell count of 0 */
+	if (!cells_name)
+		cell_count = 0;
+
+	return __of_parse_phandle_with_args(np, list_name, cells_name,
+					    cell_count, index, out_args);
+}
+
+/**
+ * of_parse_phandle_with_fixed_args() - Find a node pointed by phandle in a list
+ * @np:		pointer to a device tree node containing a list
+ * @list_name:	property name that contains a list
+ * @cell_count: number of argument cells following the phandle
+ * @index:	index of a phandle to parse out
+ * @out_args:	optional pointer to output arguments structure (will be filled)
+ *
+ * This function is useful to parse lists of phandles and their arguments.
+ * Returns 0 on success and fills out_args, on error returns appropriate
+ * errno value.
+ *
+ * Caller is responsible to call of_node_put() on the returned out_args->np
+ * pointer.
+ *
+ * Example::
+ *
+ *  phandle1: node1 {
+ *  };
+ *
+ *  phandle2: node2 {
+ *  };
+ *
+ *  node3 {
+ *	list = <&phandle1 0 2 &phandle2 2 3>;
+ *  };
+ *
+ * To get a device_node of the ``node2`` node you may call this:
+ * of_parse_phandle_with_fixed_args(node3, "list", 2, 1, &args);
+ */
+static inline int of_parse_phandle_with_fixed_args(const struct device_node *np,
+						   const char *list_name,
+						   int cell_count,
+						   int index,
+						   struct of_phandle_args *out_args)
+{
+	return __of_parse_phandle_with_args(np, list_name, NULL, cell_count,
+					    index, out_args);
+}
+
 /**
  * of_property_count_u8_elems - Count the number of u8 elements in a property
  *
-- 
cgit v1.2.3


From 2ca42c3ad9ed875b136065b010753a4caaaa1d38 Mon Sep 17 00:00:00 2001
From: Michael Walle <michael@walle.cc>
Date: Tue, 18 Jan 2022 18:35:03 +0100
Subject: of: property: define of_property_read_u{8,16,32,64}_array()
 unconditionally

We can get rid of all the empty stubs because all these functions call
of_property_read_variable_u{8,16,32,64}_array() which already have an
empty stub if CONFIG_OF is not defined.

Signed-off-by: Michael Walle <michael@walle.cc>
Signed-off-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20220118173504.2867523-3-michael@walle.cc
---
 include/linux/of.h | 274 ++++++++++++++++++++++++-----------------------------
 1 file changed, 124 insertions(+), 150 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/of.h b/include/linux/of.h
index 16d76c92fbe0..2dc77430a91a 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -410,130 +410,6 @@ extern int of_detach_node(struct device_node *);
 
 #define of_match_ptr(_ptr)	(_ptr)
 
-/**
- * of_property_read_u8_array - Find and read an array of u8 from a property.
- *
- * @np:		device node from which the property value is to be read.
- * @propname:	name of the property to be searched.
- * @out_values:	pointer to return value, modified only if return value is 0.
- * @sz:		number of array elements to read
- *
- * Search for a property in a device node and read 8-bit value(s) from
- * it.
- *
- * dts entry of array should be like:
- *  ``property = /bits/ 8 <0x50 0x60 0x70>;``
- *
- * Return: 0 on success, -EINVAL if the property does not exist,
- * -ENODATA if property does not have a value, and -EOVERFLOW if the
- * property data isn't large enough.
- *
- * The out_values is modified only if a valid u8 value can be decoded.
- */
-static inline int of_property_read_u8_array(const struct device_node *np,
-					    const char *propname,
-					    u8 *out_values, size_t sz)
-{
-	int ret = of_property_read_variable_u8_array(np, propname, out_values,
-						     sz, 0);
-	if (ret >= 0)
-		return 0;
-	else
-		return ret;
-}
-
-/**
- * of_property_read_u16_array - Find and read an array of u16 from a property.
- *
- * @np:		device node from which the property value is to be read.
- * @propname:	name of the property to be searched.
- * @out_values:	pointer to return value, modified only if return value is 0.
- * @sz:		number of array elements to read
- *
- * Search for a property in a device node and read 16-bit value(s) from
- * it.
- *
- * dts entry of array should be like:
- *  ``property = /bits/ 16 <0x5000 0x6000 0x7000>;``
- *
- * Return: 0 on success, -EINVAL if the property does not exist,
- * -ENODATA if property does not have a value, and -EOVERFLOW if the
- * property data isn't large enough.
- *
- * The out_values is modified only if a valid u16 value can be decoded.
- */
-static inline int of_property_read_u16_array(const struct device_node *np,
-					     const char *propname,
-					     u16 *out_values, size_t sz)
-{
-	int ret = of_property_read_variable_u16_array(np, propname, out_values,
-						      sz, 0);
-	if (ret >= 0)
-		return 0;
-	else
-		return ret;
-}
-
-/**
- * of_property_read_u32_array - Find and read an array of 32 bit integers
- * from a property.
- *
- * @np:		device node from which the property value is to be read.
- * @propname:	name of the property to be searched.
- * @out_values:	pointer to return value, modified only if return value is 0.
- * @sz:		number of array elements to read
- *
- * Search for a property in a device node and read 32-bit value(s) from
- * it.
- *
- * Return: 0 on success, -EINVAL if the property does not exist,
- * -ENODATA if property does not have a value, and -EOVERFLOW if the
- * property data isn't large enough.
- *
- * The out_values is modified only if a valid u32 value can be decoded.
- */
-static inline int of_property_read_u32_array(const struct device_node *np,
-					     const char *propname,
-					     u32 *out_values, size_t sz)
-{
-	int ret = of_property_read_variable_u32_array(np, propname, out_values,
-						      sz, 0);
-	if (ret >= 0)
-		return 0;
-	else
-		return ret;
-}
-
-/**
- * of_property_read_u64_array - Find and read an array of 64 bit integers
- * from a property.
- *
- * @np:		device node from which the property value is to be read.
- * @propname:	name of the property to be searched.
- * @out_values:	pointer to return value, modified only if return value is 0.
- * @sz:		number of array elements to read
- *
- * Search for a property in a device node and read 64-bit value(s) from
- * it.
- *
- * Return: 0 on success, -EINVAL if the property does not exist,
- * -ENODATA if property does not have a value, and -EOVERFLOW if the
- * property data isn't large enough.
- *
- * The out_values is modified only if a valid u64 value can be decoded.
- */
-static inline int of_property_read_u64_array(const struct device_node *np,
-					     const char *propname,
-					     u64 *out_values, size_t sz)
-{
-	int ret = of_property_read_variable_u64_array(np, propname, out_values,
-						      sz, 0);
-	if (ret >= 0)
-		return 0;
-	else
-		return ret;
-}
-
 /*
  * struct property *prop;
  * const __be32 *p;
@@ -728,32 +604,6 @@ static inline int of_property_count_elems_of_size(const struct device_node *np,
 	return -ENOSYS;
 }
 
-static inline int of_property_read_u8_array(const struct device_node *np,
-			const char *propname, u8 *out_values, size_t sz)
-{
-	return -ENOSYS;
-}
-
-static inline int of_property_read_u16_array(const struct device_node *np,
-			const char *propname, u16 *out_values, size_t sz)
-{
-	return -ENOSYS;
-}
-
-static inline int of_property_read_u32_array(const struct device_node *np,
-					     const char *propname,
-					     u32 *out_values, size_t sz)
-{
-	return -ENOSYS;
-}
-
-static inline int of_property_read_u64_array(const struct device_node *np,
-					     const char *propname,
-					     u64 *out_values, size_t sz)
-{
-	return -ENOSYS;
-}
-
 static inline int of_property_read_u32_index(const struct device_node *np,
 			const char *propname, u32 index, u32 *out_value)
 {
@@ -1328,6 +1178,130 @@ static inline bool of_property_read_bool(const struct device_node *np,
 	return prop ? true : false;
 }
 
+/**
+ * of_property_read_u8_array - Find and read an array of u8 from a property.
+ *
+ * @np:		device node from which the property value is to be read.
+ * @propname:	name of the property to be searched.
+ * @out_values:	pointer to return value, modified only if return value is 0.
+ * @sz:		number of array elements to read
+ *
+ * Search for a property in a device node and read 8-bit value(s) from
+ * it.
+ *
+ * dts entry of array should be like:
+ *  ``property = /bits/ 8 <0x50 0x60 0x70>;``
+ *
+ * Return: 0 on success, -EINVAL if the property does not exist,
+ * -ENODATA if property does not have a value, and -EOVERFLOW if the
+ * property data isn't large enough.
+ *
+ * The out_values is modified only if a valid u8 value can be decoded.
+ */
+static inline int of_property_read_u8_array(const struct device_node *np,
+					    const char *propname,
+					    u8 *out_values, size_t sz)
+{
+	int ret = of_property_read_variable_u8_array(np, propname, out_values,
+						     sz, 0);
+	if (ret >= 0)
+		return 0;
+	else
+		return ret;
+}
+
+/**
+ * of_property_read_u16_array - Find and read an array of u16 from a property.
+ *
+ * @np:		device node from which the property value is to be read.
+ * @propname:	name of the property to be searched.
+ * @out_values:	pointer to return value, modified only if return value is 0.
+ * @sz:		number of array elements to read
+ *
+ * Search for a property in a device node and read 16-bit value(s) from
+ * it.
+ *
+ * dts entry of array should be like:
+ *  ``property = /bits/ 16 <0x5000 0x6000 0x7000>;``
+ *
+ * Return: 0 on success, -EINVAL if the property does not exist,
+ * -ENODATA if property does not have a value, and -EOVERFLOW if the
+ * property data isn't large enough.
+ *
+ * The out_values is modified only if a valid u16 value can be decoded.
+ */
+static inline int of_property_read_u16_array(const struct device_node *np,
+					     const char *propname,
+					     u16 *out_values, size_t sz)
+{
+	int ret = of_property_read_variable_u16_array(np, propname, out_values,
+						      sz, 0);
+	if (ret >= 0)
+		return 0;
+	else
+		return ret;
+}
+
+/**
+ * of_property_read_u32_array - Find and read an array of 32 bit integers
+ * from a property.
+ *
+ * @np:		device node from which the property value is to be read.
+ * @propname:	name of the property to be searched.
+ * @out_values:	pointer to return value, modified only if return value is 0.
+ * @sz:		number of array elements to read
+ *
+ * Search for a property in a device node and read 32-bit value(s) from
+ * it.
+ *
+ * Return: 0 on success, -EINVAL if the property does not exist,
+ * -ENODATA if property does not have a value, and -EOVERFLOW if the
+ * property data isn't large enough.
+ *
+ * The out_values is modified only if a valid u32 value can be decoded.
+ */
+static inline int of_property_read_u32_array(const struct device_node *np,
+					     const char *propname,
+					     u32 *out_values, size_t sz)
+{
+	int ret = of_property_read_variable_u32_array(np, propname, out_values,
+						      sz, 0);
+	if (ret >= 0)
+		return 0;
+	else
+		return ret;
+}
+
+/**
+ * of_property_read_u64_array - Find and read an array of 64 bit integers
+ * from a property.
+ *
+ * @np:		device node from which the property value is to be read.
+ * @propname:	name of the property to be searched.
+ * @out_values:	pointer to return value, modified only if return value is 0.
+ * @sz:		number of array elements to read
+ *
+ * Search for a property in a device node and read 64-bit value(s) from
+ * it.
+ *
+ * Return: 0 on success, -EINVAL if the property does not exist,
+ * -ENODATA if property does not have a value, and -EOVERFLOW if the
+ * property data isn't large enough.
+ *
+ * The out_values is modified only if a valid u64 value can be decoded.
+ */
+static inline int of_property_read_u64_array(const struct device_node *np,
+					     const char *propname,
+					     u64 *out_values, size_t sz)
+{
+	int ret = of_property_read_variable_u64_array(np, propname, out_values,
+						      sz, 0);
+	if (ret >= 0)
+		return 0;
+	else
+		return ret;
+}
+
 static inline int of_property_read_u8(const struct device_node *np,
 				       const char *propname,
 				       u8 *out_value)
-- 
cgit v1.2.3


From 5298d4bfe80f6ae6ae2777bcd1357b0022d98573 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 18 Jan 2022 07:56:14 +0100
Subject: unicode: clean up the Kconfig symbol confusion

Turn the CONFIG_UNICODE symbol into a tristate that generates some always
built in code and remove the confusing CONFIG_UNICODE_UTF8_DATA symbol.

Note that a lot of the IS_ENABLED() checks could be turned from cpp
statements into normal ifs, but this change is intended to be fairly
mechanic, so that should be cleaned up later.

Fixes: 2b3d04787012 ("unicode: Add utf8-data module")
Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Reviewed-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
---
 include/linux/fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index c8510da6cc6d..fdac22d16c2b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1490,7 +1490,7 @@ struct super_block {
 #ifdef CONFIG_FS_VERITY
 	const struct fsverity_operations *s_vop;
 #endif
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
 	struct unicode_map *s_encoding;
 	__u16 s_encoding_flags;
 #endif
-- 
cgit v1.2.3


From c522e3ad296b7b692ed3960dfde467f2a34b434f Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 7 Jan 2022 09:28:41 +0000
Subject: fscache: Add a comment explaining how page-release optimisation works

Add a comment into fscache_note_page_release() to explain how the
page-release optimisation logic works[1].  It's not entirely obvious as it
has nothing to do with whether or not the netfs file contains data.

FSCACHE_COOKIE_NO_DATA_TO_READ is set if we have no data in the cache yet
(ie. the backing file lookup was negative, the file is 0 length or the
cookie got invalidated).  It means that we have no data in the cache, not
that the file is necessarily empty on the server.

FSCACHE_COOKIE_HAVE_DATA is set once we've stored data in the backing file.
From that point on, we have data we *could* read - however, it's covered by
pages in the netfs pagecache until at such time one of those covering pages
is released.

So if we've written data to the cache (HAVE_DATA) and there wasn't any data
in the cache when we started (NO_DATA_TO_READ), it may no longer be true
that we can skip reading from the cache.

Read skipping is done by cachefiles_prepare_read().

Note that tracking is not done on a per-page basis, but only on a per-file
basis.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
Link: https://lore.kernel.org/r/043a206f03929c2667a465314144e518070a9b2d.camel@kernel.org/ [1]
Link: https://lore.kernel.org/r/164251408479.3435901.9540165422908194636.stgit@warthog.procyon.org.uk/ # v1
---
 include/linux/fscache.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index ede50406bcb0..296c5f1d9f35 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -665,6 +665,11 @@ static inline void fscache_clear_inode_writeback(struct fscache_cookie *cookie,
 static inline
 void fscache_note_page_release(struct fscache_cookie *cookie)
 {
+	/* If we've written data to the cache (HAVE_DATA) and there wasn't any
+	 * data in the cache when we started (NO_DATA_TO_READ), it may no
+	 * longer be true that we can skip reading from the cache - so clear
+	 * the flag that causes reads to be skipped.
+	 */
 	if (cookie &&
 	    test_bit(FSCACHE_COOKIE_HAVE_DATA, &cookie->flags) &&
 	    test_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags))
-- 
cgit v1.2.3


From ffa65753c43142f3b803486442813744da71cff2 Mon Sep 17 00:00:00 2001
From: Alistair Popple <apopple@nvidia.com>
Date: Fri, 21 Jan 2022 22:10:46 -0800
Subject: mm/migrate.c: rework migration_entry_wait() to not take a pageref

This fixes the FIXME in migrate_vma_check_page().

Before migrating a page migration code will take a reference and check
there are no unexpected page references, failing the migration if there
are.  When a thread faults on a migration entry it will take a temporary
reference to the page to wait for the page to become unlocked signifying
the migration entry has been removed.

This reference is dropped just prior to waiting on the page lock,
however the extra reference can cause migration failures so it is
desirable to avoid taking it.

As migration code already has a reference to the migrating page an extra
reference to wait on PG_locked is unnecessary so long as the reference
can't be dropped whilst setting up the wait.

When faulting on a migration entry the ptl is taken to check the
migration entry.  Removing a migration entry also requires the ptl, and
migration code won't drop its page reference until after the migration
entry has been removed.  Therefore retaining the ptl of a migration
entry is sufficient to ensure the page has a reference.  Reworking
migration_entry_wait() to hold the ptl until the wait setup is complete
means the extra page reference is no longer needed.

[apopple@nvidia.com: v5]
  Link: https://lkml.kernel.org/r/20211213033848.1973946-1-apopple@nvidia.com

Link: https://lkml.kernel.org/r/20211118020754.954425-1-apopple@nvidia.com
Signed-off-by: Alistair Popple <apopple@nvidia.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/migrate.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 4850cc5bf813..db96e10eb8da 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -40,6 +40,8 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
 				  struct page *newpage, struct page *page);
 extern int migrate_page_move_mapping(struct address_space *mapping,
 		struct page *newpage, struct page *page, int extra_count);
+void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep,
+				spinlock_t *ptl);
 void folio_migrate_flags(struct folio *newfolio, struct folio *folio);
 void folio_migrate_copy(struct folio *newfolio, struct folio *folio);
 int folio_migrate_mapping(struct address_space *mapping,
-- 
cgit v1.2.3


From 3ddd9a808cee7284931312f2f3e854c9617f44b2 Mon Sep 17 00:00:00 2001
From: Xiaoming Ni <nixiaoming@huawei.com>
Date: Fri, 21 Jan 2022 22:10:50 -0800
Subject: sysctl: add a new register_sysctl_init() interface

Patch series "sysctl: first set of kernel/sysctl cleanups", v2.

Finally had time to respin the series of the work we had started last
year on cleaning up the kernel/sysct.c kitchen sink.  People keeps
stuffing their sysctls in that file and this creates a maintenance
burden.  So this effort is aimed at placing sysctls where they actually
belong.

I'm going to split patches up into series as there is quite a bit of
work.

This first set adds register_sysctl_init() for uses of registerting a
sysctl on the init path, adds const where missing to a few places,
generalizes common values so to be more easy to share, and starts the
move of a few kernel/sysctl.c out where they belong.

The majority of rework on v2 in this first patch set is 0-day fixes.
Eric Biederman's feedback is later addressed in subsequent patch sets.

I'll only post the first two patch sets for now.  We can address the
rest once the first two patch sets get completely reviewed / Acked.

This patch (of 9):

The kernel/sysctl.c is a kitchen sink where everyone leaves their dirty
dishes, this makes it very difficult to maintain.

To help with this maintenance let's start by moving sysctls to places
where they actually belong.  The proc sysctl maintainers do not want to
know what sysctl knobs you wish to add for your own piece of code, we
just care about the core logic.

Today though folks heavily rely on tables on kernel/sysctl.c so they can
easily just extend this table with their needed sysctls.  In order to
help users move their sysctls out we need to provide a helper which can
be used during code initialization.

We special-case the initialization use of register_sysctl() since it
*is* safe to fail, given all that sysctls do is provide a dynamic
interface to query or modify at runtime an existing variable.  So the
use case of register_sysctl() on init should *not* stop if the sysctls
don't end up getting registered.  It would be counter productive to stop
boot if a simple sysctl registration failed.

Provide a helper for init then, and document the recommended init levels
to use for callers of this routine.  We will later use this in
subsequent patches to start slimming down kernel/sysctl.c tables and
moving sysctl registration to the code which actually needs these
sysctls.

[mcgrof@kernel.org: major commit log and documentation rephrasing also moved to fs/proc/proc_sysctl.c                  ]

Link: https://lkml.kernel.org/r/20211123202347.818157-1-mcgrof@kernel.org
Link: https://lkml.kernel.org/r/20211123202347.818157-2-mcgrof@kernel.org
Signed-off-by: Xiaoming Ni <nixiaoming@huawei.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Iurii Zaikin <yzaikin@google.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Paul Turner <pjt@google.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Sebastian Reichel <sre@kernel.org>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Qing Wang <wangqing@vivo.com>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Jan Kara <jack@suse.cz>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Stephen Kitt <steve@sk2.org>
Cc: Antti Palosaari <crope@iki.fi>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Clemens Ladisch <clemens@ladisch.de>
Cc: David Airlie <airlied@linux.ie>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Julia Lawall <julia.lawall@inria.fr>
Cc: Lukas Middendorf <kernel@tuxforce.de>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Phillip Potter <phil@philpotter.co.uk>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Douglas Gilbert <dgilbert@interlog.com>
Cc: James E.J. Bottomley <jejb@linux.ibm.com>
Cc: Jani Nikula <jani.nikula@intel.com>
Cc: John Ogness <john.ogness@linutronix.de>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sysctl.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 1fa2b69c6fc3..d3ab7969b6b5 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -199,6 +199,9 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
 void unregister_sysctl_table(struct ctl_table_header * table);
 
 extern int sysctl_init(void);
+extern void __register_sysctl_init(const char *path, struct ctl_table *table,
+				 const char *table_name);
+#define register_sysctl_init(path, table) __register_sysctl_init(path, table, #table)
 void do_sysctl_args(void);
 
 extern int pwrsw_enabled;
-- 
cgit v1.2.3


From 78e36f3b0dae586f623c4a37ec5eb5496f5abbe1 Mon Sep 17 00:00:00 2001
From: Xiaoming Ni <nixiaoming@huawei.com>
Date: Fri, 21 Jan 2022 22:10:55 -0800
Subject: sysctl: move some boundary constants from sysctl.c to sysctl_vals

sysctl has helpers which let us specify boundary values for a min or max
int value.  Since these are used for a boundary check only they don't
change, so move these variables to sysctl_vals to avoid adding duplicate
variables.  This will help with our cleanup of kernel/sysctl.c.

[akpm@linux-foundation.org: update it for "mm/pagealloc: sysctl: change watermark_scale_factor max limit to 30%"]
[mcgrof@kernel.org: major rebase]

Link: https://lkml.kernel.org/r/20211123202347.818157-3-mcgrof@kernel.org
Signed-off-by: Xiaoming Ni <nixiaoming@huawei.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Iurii Zaikin <yzaikin@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Qing Wang <wangqing@vivo.com>
Cc: Sebastian Reichel <sre@kernel.org>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Stephen Kitt <steve@sk2.org>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Antti Palosaari <crope@iki.fi>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Clemens Ladisch <clemens@ladisch.de>
Cc: David Airlie <airlied@linux.ie>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Julia Lawall <julia.lawall@inria.fr>
Cc: Lukas Middendorf <kernel@tuxforce.de>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Phillip Potter <phil@philpotter.co.uk>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Douglas Gilbert <dgilbert@interlog.com>
Cc: James E.J. Bottomley <jejb@linux.ibm.com>
Cc: Jani Nikula <jani.nikula@intel.com>
Cc: John Ogness <john.ogness@linutronix.de>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sysctl.h | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index d3ab7969b6b5..47cf70c8eb93 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -38,9 +38,16 @@ struct ctl_table_header;
 struct ctl_dir;
 
 /* Keep the same order as in fs/proc/proc_sysctl.c */
-#define SYSCTL_ZERO	((void *)&sysctl_vals[0])
-#define SYSCTL_ONE	((void *)&sysctl_vals[1])
-#define SYSCTL_INT_MAX	((void *)&sysctl_vals[2])
+#define SYSCTL_NEG_ONE			((void *)&sysctl_vals[0])
+#define SYSCTL_ZERO			((void *)&sysctl_vals[1])
+#define SYSCTL_ONE			((void *)&sysctl_vals[2])
+#define SYSCTL_TWO			((void *)&sysctl_vals[3])
+#define SYSCTL_FOUR			((void *)&sysctl_vals[4])
+#define SYSCTL_ONE_HUNDRED		((void *)&sysctl_vals[5])
+#define SYSCTL_TWO_HUNDRED		((void *)&sysctl_vals[6])
+#define SYSCTL_ONE_THOUSAND		((void *)&sysctl_vals[7])
+#define SYSCTL_THREE_THOUSAND		((void *)&sysctl_vals[8])
+#define SYSCTL_INT_MAX			((void *)&sysctl_vals[9])
 
 extern const int sysctl_vals[];
 
-- 
cgit v1.2.3


From bbe7a10ed83a5fa0b0ff6161ecdc4e65a0e9c993 Mon Sep 17 00:00:00 2001
From: Xiaoming Ni <nixiaoming@huawei.com>
Date: Fri, 21 Jan 2022 22:11:00 -0800
Subject: hung_task: move hung_task sysctl interface to hung_task.c

The kernel/sysctl.c is a kitchen sink where everyone leaves their dirty
dishes, this makes it very difficult to maintain.

To help with this maintenance let's start by moving sysctls to places
where they actually belong.  The proc sysctl maintainers do not want to
know what sysctl knobs you wish to add for your own piece of code, we
just care about the core logic.

So move hung_task sysctl interface to hung_task.c and use
register_sysctl() to register the sysctl interface.

[mcgrof@kernel.org: commit log refresh and fixed 2-3 0day reported compile issues]

Link: https://lkml.kernel.org/r/20211123202347.818157-4-mcgrof@kernel.org
Signed-off-by: Xiaoming Ni <nixiaoming@huawei.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Iurii Zaikin <yzaikin@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qing Wang <wangqing@vivo.com>
Cc: Sebastian Reichel <sre@kernel.org>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Stephen Kitt <steve@sk2.org>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Antti Palosaari <crope@iki.fi>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Clemens Ladisch <clemens@ladisch.de>
Cc: David Airlie <airlied@linux.ie>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Julia Lawall <julia.lawall@inria.fr>
Cc: Lukas Middendorf <kernel@tuxforce.de>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Phillip Potter <phil@philpotter.co.uk>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Douglas Gilbert <dgilbert@interlog.com>
Cc: James E.J. Bottomley <jejb@linux.ibm.com>
Cc: Jani Nikula <jani.nikula@intel.com>
Cc: John Ogness <john.ogness@linutronix.de>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched/sysctl.h | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 304f431178fd..c19dd5a2c05c 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -7,20 +7,8 @@
 struct ctl_table;
 
 #ifdef CONFIG_DETECT_HUNG_TASK
-
-#ifdef CONFIG_SMP
-extern unsigned int sysctl_hung_task_all_cpu_backtrace;
-#else
-#define sysctl_hung_task_all_cpu_backtrace 0
-#endif /* CONFIG_SMP */
-
-extern int	     sysctl_hung_task_check_count;
-extern unsigned int  sysctl_hung_task_panic;
+/* used for hung_task and block/ */
 extern unsigned long sysctl_hung_task_timeout_secs;
-extern unsigned long sysctl_hung_task_check_interval_secs;
-extern int sysctl_hung_task_warnings;
-int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
-		void *buffer, size_t *lenp, loff_t *ppos);
 #else
 /* Avoid need for ifdefs elsewhere in the code */
 enum { sysctl_hung_task_timeout_secs = 0 };
-- 
cgit v1.2.3


From 86b12b6c5d6b46e64bf2e8080528781032e4bd90 Mon Sep 17 00:00:00 2001
From: Xiaoming Ni <nixiaoming@huawei.com>
Date: Fri, 21 Jan 2022 22:11:24 -0800
Subject: aio: move aio sysctl to aio.c

The kernel/sysctl.c is a kitchen sink where everyone leaves their dirty
dishes, this makes it very difficult to maintain.

To help with this maintenance let's start by moving sysctls to places
where they actually belong.  The proc sysctl maintainers do not want to
know what sysctl knobs you wish to add for your own piece of code, we
just care about the core logic.

Move aio sysctl to aio.c and use the new register_sysctl_init() to
register the sysctl interface for aio.

[mcgrof@kernel.org: adjust commit log to justify the move]

Link: https://lkml.kernel.org/r/20211123202347.818157-9-mcgrof@kernel.org
Signed-off-by: Xiaoming Ni <nixiaoming@huawei.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Iurii Zaikin <yzaikin@google.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Qing Wang <wangqing@vivo.com>
Cc: Sebastian Reichel <sre@kernel.org>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Stephen Kitt <steve@sk2.org>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Antti Palosaari <crope@iki.fi>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Clemens Ladisch <clemens@ladisch.de>
Cc: David Airlie <airlied@linux.ie>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Julia Lawall <julia.lawall@inria.fr>
Cc: Lukas Middendorf <kernel@tuxforce.de>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Phillip Potter <phil@philpotter.co.uk>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Douglas Gilbert <dgilbert@interlog.com>
Cc: James E.J. Bottomley <jejb@linux.ibm.com>
Cc: Jani Nikula <jani.nikula@intel.com>
Cc: John Ogness <john.ogness@linutronix.de>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/aio.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/aio.h b/include/linux/aio.h
index b83e68dd006f..86892a4fe7c8 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -20,8 +20,4 @@ static inline void kiocb_set_cancel_fn(struct kiocb *req,
 				       kiocb_cancel_fn *cancel) { }
 #endif /* CONFIG_AIO */
 
-/* for sysctl: */
-extern unsigned long aio_nr;
-extern unsigned long aio_max_nr;
-
 #endif /* __LINUX__AIO_H */
-- 
cgit v1.2.3


From 49a4de75719b6c0f1f375df9908a95cef1e34945 Mon Sep 17 00:00:00 2001
From: Xiaoming Ni <nixiaoming@huawei.com>
Date: Fri, 21 Jan 2022 22:11:29 -0800
Subject: dnotify: move dnotify sysctl to dnotify.c

The kernel/sysctl.c is a kitchen sink where everyone leaves their dirty
dishes, this makes it very difficult to maintain.

To help with this maintenance let's start by moving sysctls to places
where they actually belong.  The proc sysctl maintainers do not want to
know what sysctl knobs you wish to add for your own piece of code, we
just care about the core logic.

So move dnotify sysctls to dnotify.c and use the new
register_sysctl_init() to register the sysctl interface.

[mcgrof@kernel.org: adjust the commit log to justify the move]

Link: https://lkml.kernel.org/r/20211123202347.818157-10-mcgrof@kernel.org
Signed-off-by: Xiaoming Ni <nixiaoming@huawei.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Acked-by: Jan Kara <jack@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Iurii Zaikin <yzaikin@google.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Qing Wang <wangqing@vivo.com>
Cc: Sebastian Reichel <sre@kernel.org>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Stephen Kitt <steve@sk2.org>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Antti Palosaari <crope@iki.fi>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Clemens Ladisch <clemens@ladisch.de>
Cc: David Airlie <airlied@linux.ie>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Julia Lawall <julia.lawall@inria.fr>
Cc: Lukas Middendorf <kernel@tuxforce.de>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Phillip Potter <phil@philpotter.co.uk>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Douglas Gilbert <dgilbert@interlog.com>
Cc: James E.J. Bottomley <jejb@linux.ibm.com>
Cc: Jani Nikula <jani.nikula@intel.com>
Cc: John Ogness <john.ogness@linutronix.de>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/dnotify.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dnotify.h b/include/linux/dnotify.h
index b87c3b85a166..b1d26f9f1c9f 100644
--- a/include/linux/dnotify.h
+++ b/include/linux/dnotify.h
@@ -29,7 +29,6 @@ struct dnotify_struct {
 			    FS_CREATE | FS_RENAME |\
 			    FS_MOVED_FROM | FS_MOVED_TO)
 
-extern int dir_notify_enable;
 extern void dnotify_flush(struct file *, fl_owner_t);
 extern int fcntl_dirnotify(int, struct file *, unsigned long);
 
-- 
cgit v1.2.3


From 7b9ad122b52c9839e1f68f16c907990a6ad6f793 Mon Sep 17 00:00:00 2001
From: Xiaoming Ni <nixiaoming@huawei.com>
Date: Fri, 21 Jan 2022 22:11:59 -0800
Subject: inotify: simplify subdirectory registration with register_sysctl()

There is no need to user boiler plate code to specify a set of base
directories we're going to stuff sysctls under.  Simplify this by using
register_sysctl() and specifying the directory path directly.

Move inotify_user sysctl to inotify_user.c while at it to remove clutter
from kernel/sysctl.c.

[mcgrof@kernel.org: remember to register fanotify_table]
  Link: https://lkml.kernel.org/r/YZ5A6iWLb0h3N3RC@bombadil.infradead.org
[mcgrof@kernel.org: update commit log to reflect new path we decided to take]

Link: https://lkml.kernel.org/r/20211123202422.819032-7-mcgrof@kernel.org
Signed-off-by: Xiaoming Ni <nixiaoming@huawei.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Antti Palosaari <crope@iki.fi>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Cc: Clemens Ladisch <clemens@ladisch.de>
Cc: David Airlie <airlied@linux.ie>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Iurii Zaikin <yzaikin@google.com>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Julia Lawall <julia.lawall@inria.fr>
Cc: Kees Cook <keescook@chromium.org>
Cc: Lukas Middendorf <kernel@tuxforce.de>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Phillip Potter <phil@philpotter.co.uk>
Cc: Qing Wang <wangqing@vivo.com>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Sebastian Reichel <sre@kernel.org>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Stephen Kitt <steve@sk2.org>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Douglas Gilbert <dgilbert@interlog.com>
Cc: James E.J. Bottomley <jejb@linux.ibm.com>
Cc: Jani Nikula <jani.nikula@intel.com>
Cc: John Ogness <john.ogness@linutronix.de>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/fanotify.h | 2 --
 include/linux/inotify.h  | 3 ---
 2 files changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index 3afdf339d53c..419cadcd7ff5 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -5,8 +5,6 @@
 #include <linux/sysctl.h>
 #include <uapi/linux/fanotify.h>
 
-extern struct ctl_table fanotify_table[]; /* for sysctl */
-
 #define FAN_GROUP_FLAG(group, flag) \
 	((group)->fanotify_data.flags & (flag))
 
diff --git a/include/linux/inotify.h b/include/linux/inotify.h
index 6a24905f6e1e..8d20caa1b268 100644
--- a/include/linux/inotify.h
+++ b/include/linux/inotify.h
@@ -7,11 +7,8 @@
 #ifndef _LINUX_INOTIFY_H
 #define _LINUX_INOTIFY_H
 
-#include <linux/sysctl.h>
 #include <uapi/linux/inotify.h>
 
-extern struct ctl_table inotify_table[]; /* for sysctl */
-
 #define ALL_INOTIFY_BITS (IN_ACCESS | IN_MODIFY | IN_ATTRIB | IN_CLOSE_WRITE | \
 			  IN_CLOSE_NOWRITE | IN_OPEN | IN_MOVED_FROM | \
 			  IN_MOVED_TO | IN_CREATE | IN_DELETE | \
-- 
cgit v1.2.3


From a8f5de894f76f1c73f4a068d04897a5e2f873825 Mon Sep 17 00:00:00 2001
From: Xiaoming Ni <nixiaoming@huawei.com>
Date: Fri, 21 Jan 2022 22:12:09 -0800
Subject: eventpoll: simplify sysctl declaration with register_sysctl()

The kernel/sysctl.c is a kitchen sink where everyone leaves their dirty
dishes, this makes it very difficult to maintain.

To help with this maintenance let's start by moving sysctls to places
where they actually belong.  The proc sysctl maintainers do not want to
know what sysctl knobs you wish to add for your own piece of code, we
just care about the core logic.

So move the epoll_table sysctl to fs/eventpoll.c and use
register_sysctl().

Link: https://lkml.kernel.org/r/20211123202422.819032-9-mcgrof@kernel.org
Signed-off-by: Xiaoming Ni <nixiaoming@huawei.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Antti Palosaari <crope@iki.fi>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Cc: Clemens Ladisch <clemens@ladisch.de>
Cc: David Airlie <airlied@linux.ie>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Iurii Zaikin <yzaikin@google.com>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Julia Lawall <julia.lawall@inria.fr>
Cc: Kees Cook <keescook@chromium.org>
Cc: Lukas Middendorf <kernel@tuxforce.de>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Phillip Potter <phil@philpotter.co.uk>
Cc: Qing Wang <wangqing@vivo.com>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Sebastian Reichel <sre@kernel.org>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Stephen Kitt <steve@sk2.org>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Douglas Gilbert <dgilbert@interlog.com>
Cc: James E.J. Bottomley <jejb@linux.ibm.com>
Cc: Jani Nikula <jani.nikula@intel.com>
Cc: John Ogness <john.ogness@linutronix.de>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/poll.h   | 2 --
 include/linux/sysctl.h | 1 -
 2 files changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/poll.h b/include/linux/poll.h
index 1cdc32b1f1b0..a9e0e1c2d1f2 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -8,12 +8,10 @@
 #include <linux/wait.h>
 #include <linux/string.h>
 #include <linux/fs.h>
-#include <linux/sysctl.h>
 #include <linux/uaccess.h>
 #include <uapi/linux/poll.h>
 #include <uapi/linux/eventpoll.h>
 
-extern struct ctl_table epoll_table[]; /* for sysctl */
 /* ~832 bytes of stack space used max in sys_select/sys_poll before allocating
    additional memory. */
 #ifdef __clang__
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 47cf70c8eb93..6dd0f277f844 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -219,7 +219,6 @@ extern int no_unaligned_warning;
 extern struct ctl_table sysctl_mount_point[];
 extern struct ctl_table random_table[];
 extern struct ctl_table firmware_config_table[];
-extern struct ctl_table epoll_table[];
 
 #else /* CONFIG_SYSCTL */
 static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
-- 
cgit v1.2.3


From 6aad36d421d8bfe156508fa4edfe67827234cf0f Mon Sep 17 00:00:00 2001
From: Xiaoming Ni <nixiaoming@huawei.com>
Date: Fri, 21 Jan 2022 22:12:13 -0800
Subject: firmware_loader: move firmware sysctl to its own files

Patch series "sysctl: 3rd set of kernel/sysctl cleanups", v2.

This is the third set of patches to help address cleaning the kitchen
seink in kernel/sysctl.c and to move sysctls away to where they are
actually implemented / used.

This patch (of 8):

kernel/sysctl.c is a kitchen sink where everyone leaves their dirty
dishes, this makes it very difficult to maintain.

To help with this maintenance let's start by moving sysctls to places
where they actually belong.  The proc sysctl maintainers do not want to
know what sysctl knobs you wish to add for your own piece of code, we
just care about the core logic.

So move the firmware configuration sysctl table to the only place where
it is used, and make it clear that if sysctls are disabled this is not
used.

[akpm@linux-foundation.org: export register_firmware_config_sysctl and unregister_firmware_config_sysctl to modules]
[akpm@linux-foundation.org: use EXPORT_SYMBOL_NS_GPL instead]
[sfr@canb.auug.org.au: fix that so it compiles]
  Link: https://lkml.kernel.org/r/20211201160626.401d828d@canb.auug.org.au
[mcgrof@kernel.org: major commit log update to justify the move]

Link: https://lkml.kernel.org/r/20211124231435.1445213-1-mcgrof@kernel.org
Link: https://lkml.kernel.org/r/20211124231435.1445213-2-mcgrof@kernel.org
Signed-off-by: Xiaoming Ni <nixiaoming@huawei.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Kees Cook <keescook@chromium.org>
Cc: Iurii Zaikin <yzaikin@google.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Stephen Kitt <steve@sk2.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: John Ogness <john.ogness@linutronix.de>
Cc: Douglas Gilbert <dgilbert@interlog.com>
Cc: James E.J. Bottomley <jejb@linux.ibm.com>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Lukas Middendorf <kernel@tuxforce.de>
Cc: Antti Palosaari <crope@iki.fi>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Cc: Clemens Ladisch <clemens@ladisch.de>
Cc: David Airlie <airlied@linux.ie>
Cc: Jani Nikula <jani.nikula@intel.com>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Julia Lawall <julia.lawall@inria.fr>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Phillip Potter <phil@philpotter.co.uk>
Cc: Qing Wang <wangqing@vivo.com>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Sebastian Reichel <sre@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sysctl.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 6dd0f277f844..3985e9c80155 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -218,7 +218,6 @@ extern int no_unaligned_warning;
 
 extern struct ctl_table sysctl_mount_point[];
 extern struct ctl_table random_table[];
-extern struct ctl_table firmware_config_table[];
 
 #else /* CONFIG_SYSCTL */
 static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
-- 
cgit v1.2.3


From 5475e8f03c80bbce7b43a57d861f5acc44a60b22 Mon Sep 17 00:00:00 2001
From: Xiaoming Ni <nixiaoming@huawei.com>
Date: Fri, 21 Jan 2022 22:12:18 -0800
Subject: random: move the random sysctl declarations to its own file

kernel/sysctl.c is a kitchen sink where everyone leaves their dirty
dishes, this makes it very difficult to maintain.

To help with this maintenance let's start by moving sysctls to places
where they actually belong.  The proc sysctl maintainers do not want to
know what sysctl knobs you wish to add for your own piece of code, we
just care about the core logic.

So move the random sysctls to their own file and use
register_sysctl_init().

[mcgrof@kernel.org: commit log update to justify the move]

Link: https://lkml.kernel.org/r/20211124231435.1445213-3-mcgrof@kernel.org
Signed-off-by: Xiaoming Ni <nixiaoming@huawei.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Antti Palosaari <crope@iki.fi>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Cc: Clemens Ladisch <clemens@ladisch.de>
Cc: David Airlie <airlied@linux.ie>
Cc: Douglas Gilbert <dgilbert@interlog.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Iurii Zaikin <yzaikin@google.com>
Cc: James E.J. Bottomley <jejb@linux.ibm.com>
Cc: Jani Nikula <jani.nikula@intel.com>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: John Ogness <john.ogness@linutronix.de>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Julia Lawall <julia.lawall@inria.fr>
Cc: Kees Cook <keescook@chromium.org>
Cc: Lukas Middendorf <kernel@tuxforce.de>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Phillip Potter <phil@philpotter.co.uk>
Cc: Qing Wang <wangqing@vivo.com>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Sebastian Reichel <sre@kernel.org>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Stephen Kitt <steve@sk2.org>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sysctl.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 3985e9c80155..fce05a060bc5 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -217,7 +217,6 @@ extern int unaligned_dump_stack;
 extern int no_unaligned_warning;
 
 extern struct ctl_table sysctl_mount_point[];
-extern struct ctl_table random_table[];
 
 #else /* CONFIG_SYSCTL */
 static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
-- 
cgit v1.2.3


From ee9efac48a082904d17a20131aa73d82f058cdd6 Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@kernel.org>
Date: Fri, 21 Jan 2022 22:12:23 -0800
Subject: sysctl: add helper to register a sysctl mount point

The way to create a subdirectory on top of sysctl_mount_point is a bit
obscure, and *why* we do that even so more.  Provide a helper which
makes it clear why we do this.

[akpm@linux-foundation.org: export register_sysctl_mount_point() to
modules]

Link: https://lkml.kernel.org/r/20211124231435.1445213-4-mcgrof@kernel.org
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Suggested-by: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Antti Palosaari <crope@iki.fi>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Cc: Clemens Ladisch <clemens@ladisch.de>
Cc: David Airlie <airlied@linux.ie>
Cc: Douglas Gilbert <dgilbert@interlog.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Iurii Zaikin <yzaikin@google.com>
Cc: James E.J. Bottomley <jejb@linux.ibm.com>
Cc: Jani Nikula <jani.nikula@intel.com>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: John Ogness <john.ogness@linutronix.de>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Julia Lawall <julia.lawall@inria.fr>
Cc: Kees Cook <keescook@chromium.org>
Cc: Lukas Middendorf <kernel@tuxforce.de>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Phillip Potter <phil@philpotter.co.uk>
Cc: Qing Wang <wangqing@vivo.com>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Sebastian Reichel <sre@kernel.org>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Stephen Kitt <steve@sk2.org>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Xiaoming Ni <nixiaoming@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sysctl.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index fce05a060bc5..746c098a6ff5 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -209,6 +209,8 @@ extern int sysctl_init(void);
 extern void __register_sysctl_init(const char *path, struct ctl_table *table,
 				 const char *table_name);
 #define register_sysctl_init(path, table) __register_sysctl_init(path, table, #table)
+extern struct ctl_table_header *register_sysctl_mount_point(const char *path);
+
 void do_sysctl_args(void);
 
 extern int pwrsw_enabled;
@@ -224,6 +226,11 @@ static inline struct ctl_table_header *register_sysctl_table(struct ctl_table *
 	return NULL;
 }
 
+static inline struct sysctl_header *register_sysctl_mount_point(const char *path)
+{
+	return NULL;
+}
+
 static inline struct ctl_table_header *register_sysctl_paths(
 			const struct ctl_path *path, struct ctl_table *table)
 {
-- 
cgit v1.2.3


From 0df8bdd5e3b3e557ce2c2575fce0c64c5dd1045a Mon Sep 17 00:00:00 2001
From: Xiaoming Ni <nixiaoming@huawei.com>
Date: Fri, 21 Jan 2022 22:12:43 -0800
Subject: stackleak: move stack_erasing sysctl to stackleak.c

kernel/sysctl.c is a kitchen sink where everyone leaves their dirty
dishes, this makes it very difficult to maintain.

To help with this maintenance let's start by moving sysctls to places
where they actually belong.  The proc sysctl maintainers do not want to
know what sysctl knobs you wish to add for your own piece of code, we
just care about the core logic.

So move the stack_erasing sysctl from kernel/sysctl.c to
kernel/stackleak.c and use register_sysctl() to register the sysctl
interface.

[mcgrof@kernel.org: commit log update]

Link: https://lkml.kernel.org/r/20211124231435.1445213-8-mcgrof@kernel.org
Signed-off-by: Xiaoming Ni <nixiaoming@huawei.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Antti Palosaari <crope@iki.fi>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Cc: Clemens Ladisch <clemens@ladisch.de>
Cc: David Airlie <airlied@linux.ie>
Cc: Douglas Gilbert <dgilbert@interlog.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Iurii Zaikin <yzaikin@google.com>
Cc: James E.J. Bottomley <jejb@linux.ibm.com>
Cc: Jani Nikula <jani.nikula@intel.com>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: John Ogness <john.ogness@linutronix.de>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Julia Lawall <julia.lawall@inria.fr>
Cc: Kees Cook <keescook@chromium.org>
Cc: Lukas Middendorf <kernel@tuxforce.de>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Phillip Potter <phil@philpotter.co.uk>
Cc: Qing Wang <wangqing@vivo.com>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Sebastian Reichel <sre@kernel.org>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Stephen Kitt <steve@sk2.org>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/stackleak.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/stackleak.h b/include/linux/stackleak.h
index a59db2f08e76..ccaab2043fcd 100644
--- a/include/linux/stackleak.h
+++ b/include/linux/stackleak.h
@@ -23,11 +23,6 @@ static inline void stackleak_task_init(struct task_struct *t)
 # endif
 }
 
-#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE
-int stack_erasing_sysctl(struct ctl_table *table, int write,
-			void *buffer, size_t *lenp, loff_t *ppos);
-#endif
-
 #else /* !CONFIG_GCC_PLUGIN_STACKLEAK */
 static inline void stackleak_task_init(struct task_struct *t) { }
 #endif
-- 
cgit v1.2.3


From b1f2aff888af54a057c2c3c0d88a13ef5d37b52a Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@kernel.org>
Date: Fri, 21 Jan 2022 22:12:48 -0800
Subject: sysctl: share unsigned long const values

Provide a way to share unsigned long values.  This will allow others to
not have to re-invent these values.

Link: https://lkml.kernel.org/r/20211124231435.1445213-9-mcgrof@kernel.org
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Antti Palosaari <crope@iki.fi>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Cc: Clemens Ladisch <clemens@ladisch.de>
Cc: David Airlie <airlied@linux.ie>
Cc: Douglas Gilbert <dgilbert@interlog.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Iurii Zaikin <yzaikin@google.com>
Cc: James E.J. Bottomley <jejb@linux.ibm.com>
Cc: Jani Nikula <jani.nikula@intel.com>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: John Ogness <john.ogness@linutronix.de>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Julia Lawall <julia.lawall@inria.fr>
Cc: Kees Cook <keescook@chromium.org>
Cc: Lukas Middendorf <kernel@tuxforce.de>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Phillip Potter <phil@philpotter.co.uk>
Cc: Qing Wang <wangqing@vivo.com>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Sebastian Reichel <sre@kernel.org>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Stephen Kitt <steve@sk2.org>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Xiaoming Ni <nixiaoming@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sysctl.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 746c098a6ff5..2de6d20d191b 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -51,6 +51,12 @@ struct ctl_dir;
 
 extern const int sysctl_vals[];
 
+#define SYSCTL_LONG_ZERO	((void *)&sysctl_long_vals[0])
+#define SYSCTL_LONG_ONE		((void *)&sysctl_long_vals[1])
+#define SYSCTL_LONG_MAX		((void *)&sysctl_long_vals[2])
+
+extern const unsigned long sysctl_long_vals[];
+
 typedef int proc_handler(struct ctl_table *ctl, int write, void *buffer,
 		size_t *lenp, loff_t *ppos);
 
-- 
cgit v1.2.3


From 1d67fe585049d3e2448b997af78c68cbf90ada09 Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@kernel.org>
Date: Fri, 21 Jan 2022 22:12:52 -0800
Subject: fs: move inode sysctls to its own file

Patch series "sysctl: 4th set of kernel/sysctl cleanups".

This is slimming down the fs uses of kernel/sysctl.c to the point that
the next step is to just get rid of the fs base directory for it and
move that elsehwere, so that next patch series starts dealing with that
to demo how we can end up cleaning up a full base directory from
kernel/sysctl.c, one at a time.

This patch (of 9):

kernel/sysctl.c is a kitchen sink where everyone leaves their dirty
dishes, this makes it very difficult to maintain.

To help with this maintenance let's start by moving sysctls to places
where they actually belong.  The proc sysctl maintainers do not want to
know what sysctl knobs you wish to add for your own piece of code, we
just care about the core logic.

So move the inode sysctls to its own file.  Since we are no longer using
this outside of fs/ remove the extern declaration of its respective proc
helper.

We use early_initcall() as it is the earliest we can use.

[arnd@arndb.de: avoid unused-variable warning]
  Link: https://lkml.kernel.org/r/20211203190123.874239-1-arnd@kernel.org

Link: https://lkml.kernel.org/r/20211129205548.605569-1-mcgrof@kernel.org
Link: https://lkml.kernel.org/r/20211129205548.605569-2-mcgrof@kernel.org
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Kees Cook <keescook@chromium.org>
Cc: Iurii Zaikin <yzaikin@google.com>
Cc: Xiaoming Ni <nixiaoming@huawei.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Stephen Kitt <steve@sk2.org>
Cc: Lukas Middendorf <kernel@tuxforce.de>
Cc: Antti Palosaari <crope@iki.fi>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/fs.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index c8510da6cc6d..044de67c8167 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -82,7 +82,6 @@ extern void __init files_maxfiles_init(void);
 extern struct files_stat_struct files_stat;
 extern unsigned long get_max_files(void);
 extern unsigned int sysctl_nr_open;
-extern struct inodes_stat_t inodes_stat;
 extern int leases_enable, lease_break_time;
 extern int sysctl_protected_symlinks;
 extern int sysctl_protected_hardlinks;
@@ -3537,8 +3536,6 @@ int proc_nr_files(struct ctl_table *table, int write,
 		  void *buffer, size_t *lenp, loff_t *ppos);
 int proc_nr_dentry(struct ctl_table *table, int write,
 		  void *buffer, size_t *lenp, loff_t *ppos);
-int proc_nr_inodes(struct ctl_table *table, int write,
-		   void *buffer, size_t *lenp, loff_t *ppos);
 int __init list_bdev_fs_names(char *buf, size_t size);
 
 #define __FMODE_EXEC		((__force int) FMODE_EXEC)
-- 
cgit v1.2.3


From 204d5a24e15562b2816825c0f9b49d26814b77be Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@kernel.org>
Date: Fri, 21 Jan 2022 22:12:56 -0800
Subject: fs: move fs stat sysctls to file_table.c

kernel/sysctl.c is a kitchen sink where everyone leaves their dirty
dishes, this makes it very difficult to maintain.

To help with this maintenance let's start by moving sysctls to places
where they actually belong.  The proc sysctl maintainers do not want to
know what sysctl knobs you wish to add for your own piece of code, we
just care about the core logic.

We can create the sysctl dynamically on early init for fs stat to help
with this clutter.  This dusts off the fs stat syctls knobs and puts
them into where they are declared.

Link: https://lkml.kernel.org/r/20211129205548.605569-3-mcgrof@kernel.org
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Antti Palosaari <crope@iki.fi>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Iurii Zaikin <yzaikin@google.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Lukas Middendorf <kernel@tuxforce.de>
Cc: Stephen Kitt <steve@sk2.org>
Cc: Xiaoming Ni <nixiaoming@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/fs.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 044de67c8167..1e6761966120 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -79,7 +79,6 @@ extern void __init inode_init_early(void);
 extern void __init files_init(void);
 extern void __init files_maxfiles_init(void);
 
-extern struct files_stat_struct files_stat;
 extern unsigned long get_max_files(void);
 extern unsigned int sysctl_nr_open;
 extern int leases_enable, lease_break_time;
@@ -3532,8 +3531,6 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf,
 			  size_t len, loff_t *ppos);
 
 struct ctl_table;
-int proc_nr_files(struct ctl_table *table, int write,
-		  void *buffer, size_t *lenp, loff_t *ppos);
 int proc_nr_dentry(struct ctl_table *table, int write,
 		  void *buffer, size_t *lenp, loff_t *ppos);
 int __init list_bdev_fs_names(char *buf, size_t size);
-- 
cgit v1.2.3


From c8c0c239d5ab1e3e8d2bb0453ce642fe2c6357ec Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@kernel.org>
Date: Fri, 21 Jan 2022 22:12:59 -0800
Subject: fs: move dcache sysctls to its own file

kernel/sysctl.c is a kitchen sink where everyone leaves their dirty
dishes, this makes it very difficult to maintain.

To help with this maintenance let's start by moving sysctls to places
where they actually belong.  The proc sysctl maintainers do not want to
know what sysctl knobs you wish to add for your own piece of code, we
just care about the core logic.

So move the dcache sysctl clutter out of kernel/sysctl.c.  This is a
small one-off entry, perhaps later we can simplify this representation,
but for now we use the helpers we have.  We won't know how we can
simplify this further untl we're fully done with the cleanup.

[arnd@arndb.de: avoid unused-function warning]
  Link: https://lkml.kernel.org/r/20211203190123.874239-2-arnd@kernel.org

Link: https://lkml.kernel.org/r/20211129205548.605569-4-mcgrof@kernel.org
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Antti Palosaari <crope@iki.fi>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Iurii Zaikin <yzaikin@google.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Lukas Middendorf <kernel@tuxforce.de>
Cc: Stephen Kitt <steve@sk2.org>
Cc: Xiaoming Ni <nixiaoming@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/dcache.h | 10 ----------
 include/linux/fs.h     |  2 --
 2 files changed, 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 9e23d33bb6f1..f5bba51480b2 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -61,16 +61,6 @@ extern const struct qstr empty_name;
 extern const struct qstr slash_name;
 extern const struct qstr dotdot_name;
 
-struct dentry_stat_t {
-	long nr_dentry;
-	long nr_unused;
-	long age_limit;		/* age in seconds */
-	long want_pages;	/* pages requested by system */
-	long nr_negative;	/* # of unused negative dentries */
-	long dummy;		/* Reserved for future use */
-};
-extern struct dentry_stat_t dentry_stat;
-
 /*
  * Try to keep struct dentry aligned on 64 byte cachelines (this will
  * give reasonable cacheline footprint with larger lines without the
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 1e6761966120..9b856d5da9e2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3531,8 +3531,6 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf,
 			  size_t len, loff_t *ppos);
 
 struct ctl_table;
-int proc_nr_dentry(struct ctl_table *table, int write,
-		  void *buffer, size_t *lenp, loff_t *ppos);
 int __init list_bdev_fs_names(char *buf, size_t size);
 
 #define __FMODE_EXEC		((__force int) FMODE_EXEC)
-- 
cgit v1.2.3


From 54771613e8a7dbbba2a205ddf1b33e25a290b3fd Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@kernel.org>
Date: Fri, 21 Jan 2022 22:13:03 -0800
Subject: sysctl: move maxolduid as a sysctl specific const
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The maxolduid value is only shared for sysctl purposes for use on a max
range.  Just stuff this into our shared const array.

[akpm@linux-foundation.org: fix sysctl_vals[], per Mickaël]

Link: https://lkml.kernel.org/r/20211129205548.605569-5-mcgrof@kernel.org
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Mickaël Salaün <mic@digikod.net>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Antti Palosaari <crope@iki.fi>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Iurii Zaikin <yzaikin@google.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Lukas Middendorf <kernel@tuxforce.de>
Cc: Stephen Kitt <steve@sk2.org>
Cc: Xiaoming Ni <nixiaoming@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sysctl.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 2de6d20d191b..bb921eb8a02d 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -49,6 +49,9 @@ struct ctl_dir;
 #define SYSCTL_THREE_THOUSAND		((void *)&sysctl_vals[8])
 #define SYSCTL_INT_MAX			((void *)&sysctl_vals[9])
 
+/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
+#define SYSCTL_MAXOLDUID		((void *)&sysctl_vals[10])
+
 extern const int sysctl_vals[];
 
 #define SYSCTL_LONG_ZERO	((void *)&sysctl_long_vals[0])
-- 
cgit v1.2.3


From dd81faa88340a1fe8cd81c8ecbadd8e95c58549c Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@kernel.org>
Date: Fri, 21 Jan 2022 22:13:10 -0800
Subject: fs: move locking sysctls where they are used

kernel/sysctl.c is a kitchen sink where everyone leaves their dirty
dishes, this makes it very difficult to maintain.

To help with this maintenance let's start by moving sysctls to places
where they actually belong.  The proc sysctl maintainers do not want to
know what sysctl knobs you wish to add for your own piece of code, we
just care about the core logic.

The locking fs sysctls are only used on fs/locks.c, so move them there.

Link: https://lkml.kernel.org/r/20211129205548.605569-7-mcgrof@kernel.org
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Antti Palosaari <crope@iki.fi>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Iurii Zaikin <yzaikin@google.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Lukas Middendorf <kernel@tuxforce.de>
Cc: Stephen Kitt <steve@sk2.org>
Cc: Xiaoming Ni <nixiaoming@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/fs.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9b856d5da9e2..0e08c3dd8f75 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -82,10 +82,6 @@ extern void __init files_maxfiles_init(void);
 extern unsigned long get_max_files(void);
 extern unsigned int sysctl_nr_open;
 extern int leases_enable, lease_break_time;
-extern int sysctl_protected_symlinks;
-extern int sysctl_protected_hardlinks;
-extern int sysctl_protected_fifos;
-extern int sysctl_protected_regular;
 
 typedef __kernel_rwf_t rwf_t;
 
-- 
cgit v1.2.3


From 9c011be132972ff94bde2ae99064e29f94e85c68 Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@kernel.org>
Date: Fri, 21 Jan 2022 22:13:13 -0800
Subject: fs: move namei sysctls to its own file

kernel/sysctl.c is a kitchen sink where everyone leaves their dirty
dishes, this makes it very difficult to maintain.

To help with this maintenance let's start by moving sysctls to places
where they actually belong.  The proc sysctl maintainers do not want to
know what sysctl knobs you wish to add for your own piece of code, we
just care about the core logic.

So move namei's own sysctl knobs to its own file.

Other than the move we also avoid initializing two static variables to 0
as this is not needed:

  * sysctl_protected_symlinks
  * sysctl_protected_hardlinks

Link: https://lkml.kernel.org/r/20211129205548.605569-8-mcgrof@kernel.org
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Antti Palosaari <crope@iki.fi>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Iurii Zaikin <yzaikin@google.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Lukas Middendorf <kernel@tuxforce.de>
Cc: Stephen Kitt <steve@sk2.org>
Cc: Xiaoming Ni <nixiaoming@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/fs.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 0e08c3dd8f75..9617dea24978 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -81,7 +81,6 @@ extern void __init files_maxfiles_init(void);
 
 extern unsigned long get_max_files(void);
 extern unsigned int sysctl_nr_open;
-extern int leases_enable, lease_break_time;
 
 typedef __kernel_rwf_t rwf_t;
 
-- 
cgit v1.2.3


From 1998f19324d24df7de4e74d81503b4299eb99e7d Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@kernel.org>
Date: Fri, 21 Jan 2022 22:13:20 -0800
Subject: fs: move pipe sysctls to is own file

kernel/sysctl.c is a kitchen sink where everyone leaves their dirty
dishes, this makes it very difficult to maintain.

To help with this maintenance let's start by moving sysctls to places
where they actually belong.  The proc sysctl maintainers do not want to
know what sysctl knobs you wish to add for your own piece of code, we
just care about the core logic.

So move the pipe sysctls to its own file.

Link: https://lkml.kernel.org/r/20211129205548.605569-10-mcgrof@kernel.org
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Antti Palosaari <crope@iki.fi>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Iurii Zaikin <yzaikin@google.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Lukas Middendorf <kernel@tuxforce.de>
Cc: Stephen Kitt <steve@sk2.org>
Cc: Xiaoming Ni <nixiaoming@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pipe_fs_i.h | 4 ----
 include/linux/sysctl.h    | 6 ++++++
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index fc5642431b92..c00c618ef290 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -238,10 +238,6 @@ void pipe_lock(struct pipe_inode_info *);
 void pipe_unlock(struct pipe_inode_info *);
 void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *);
 
-extern unsigned int pipe_max_size;
-extern unsigned long pipe_user_pages_hard;
-extern unsigned long pipe_user_pages_soft;
-
 /* Wait for a pipe to be readable/writable while dropping the pipe lock */
 void pipe_wait_readable(struct pipe_inode_info *);
 void pipe_wait_writable(struct pipe_inode_info *);
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index bb921eb8a02d..4294e9668bd5 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -221,6 +221,12 @@ extern void __register_sysctl_init(const char *path, struct ctl_table *table,
 extern struct ctl_table_header *register_sysctl_mount_point(const char *path);
 
 void do_sysctl_args(void);
+int do_proc_douintvec(struct ctl_table *table, int write,
+		      void *buffer, size_t *lenp, loff_t *ppos,
+		      int (*conv)(unsigned long *lvalp,
+				  unsigned int *valp,
+				  int write, void *data),
+		      void *data);
 
 extern int pwrsw_enabled;
 extern int unaligned_enabled;
-- 
cgit v1.2.3


From 51cb8dfc5a5c39e6c70376b9dc9a14d624a9d271 Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@kernel.org>
Date: Fri, 21 Jan 2022 22:13:24 -0800
Subject: sysctl: add and use base directory declarer and registration helper

Patch series "sysctl: add and use base directory declarer and
registration helper".

In this patch series we start addressing base directories, and so we
start with the "fs" sysctls.  The end goal is we end up completely
moving all "fs" sysctl knobs out from kernel/sysctl.

This patch (of 6):

Add a set of helpers which can be used to declare and register base
directory sysctls on their own.  We do this so we can later move each of
the base sysctl directories like "fs", "kernel", etc, to their own
respective files instead of shoving the declarations and registrations
all on kernel/sysctl.c.  The lazy approach has caught up and with this,
we just end up extending the list of base directories / sysctls on one
file and this makes maintenance difficult due to merge conflicts from
many developers.

The declarations are used first by kernel/sysctl.c for registration its
own base which over time we'll try to clean up.  It will be used in the
next patch to demonstrate how to cleanly deal with base sysctl
directories.

[mcgrof@kernel.org: null-terminate the ctl_table arrays]
  Link: https://lkml.kernel.org/r/YafJY3rXDYnjK/gs@bombadil.infradead.org

Link: https://lkml.kernel.org/r/20211129211943.640266-1-mcgrof@kernel.org
Link: https://lkml.kernel.org/r/20211129211943.640266-2-mcgrof@kernel.org
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Kees Cook <keescook@chromium.org>
Cc: Iurii Zaikin <yzaikin@google.com>
Cc: Xiaoming Ni <nixiaoming@huawei.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Stephen Kitt <steve@sk2.org>
Cc: Lukas Middendorf <kernel@tuxforce.de>
Cc: Antti Palosaari <crope@iki.fi>
Cc: Christian Brauner <christian.brauner@ubuntu.com>
Cc: Eric Biggers <ebiggers@google.com>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sysctl.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 4294e9668bd5..02134a8abad7 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -194,6 +194,20 @@ struct ctl_path {
 
 #ifdef CONFIG_SYSCTL
 
+#define DECLARE_SYSCTL_BASE(_name, _table)				\
+static struct ctl_table _name##_base_table[] = {			\
+	{								\
+		.procname	= #_name,				\
+		.mode		= 0555,					\
+		.child		= _table,				\
+	},								\
+	{ },								\
+}
+
+extern int __register_sysctl_base(struct ctl_table *base_table);
+
+#define register_sysctl_base(_name) __register_sysctl_base(_name##_base_table)
+
 void proc_sys_poll_notify(struct ctl_table_poll *poll);
 
 extern void setup_sysctl_set(struct ctl_table_set *p,
@@ -236,6 +250,16 @@ extern int no_unaligned_warning;
 extern struct ctl_table sysctl_mount_point[];
 
 #else /* CONFIG_SYSCTL */
+
+#define DECLARE_SYSCTL_BASE(_name, _table)
+
+static inline int __register_sysctl_base(struct ctl_table *base_table)
+{
+	return 0;
+}
+
+#define register_sysctl_base(table) __register_sysctl_base(table)
+
 static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
 {
 	return NULL;
-- 
cgit v1.2.3


From ab171b952c6e065779687b44041038efdadb3915 Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@kernel.org>
Date: Fri, 21 Jan 2022 22:13:27 -0800
Subject: fs: move namespace sysctls and declare fs base directory

This moves the namespace sysctls to its own file as part of the
kernel/sysctl.c spring cleaning

Since we have now removed all sysctls for "fs", we now have to declare
it on the filesystem code, we do that using the new helper, which
reduces boiler plate code.

We rename init_fs_shared_sysctls() to init_fs_sysctls() to reflect that
now fs/sysctls.c is taking on the burden of being the first to register
the base directory as well.

Lastly, since init code will load in the order in which we link it we
have to move the sysctl code to be linked in early, so that its early
init routine runs prior to other fs code.  This way, other filesystem
code can register their own sysctls using the helpers after this:

  * register_sysctl_init()
  * register_sysctl()

Link: https://lkml.kernel.org/r/20211129211943.640266-3-mcgrof@kernel.org
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: Antti Palosaari <crope@iki.fi>
Cc: Christian Brauner <christian.brauner@ubuntu.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Iurii Zaikin <yzaikin@google.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Lukas Middendorf <kernel@tuxforce.de>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: Stephen Kitt <steve@sk2.org>
Cc: Xiaoming Ni <nixiaoming@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mount.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mount.h b/include/linux/mount.h
index 5d92a7e1a742..7f18a7555dff 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -113,9 +113,6 @@ extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list);
 extern void mark_mounts_for_expiry(struct list_head *mounts);
 
 extern dev_t name_to_dev_t(const char *name);
-
-extern unsigned int sysctl_mount_max;
-
 extern bool path_is_mountpoint(const struct path *path);
 
 extern void kern_unmount_array(struct vfsmount *mnt[], unsigned int num);
-- 
cgit v1.2.3


From d8c0418aac78e661b5283c9d6a1dfc61d44f26fd Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@kernel.org>
Date: Fri, 21 Jan 2022 22:13:31 -0800
Subject: kernel/sysctl.c: rename sysctl_init() to sysctl_init_bases()

Rename sysctl_init() to sysctl_init_bases() so to reflect exactly what
this is doing.

Link: https://lkml.kernel.org/r/20211129211943.640266-4-mcgrof@kernel.org
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: Antti Palosaari <crope@iki.fi>
Cc: Christian Brauner <christian.brauner@ubuntu.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Iurii Zaikin <yzaikin@google.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Lukas Middendorf <kernel@tuxforce.de>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: Stephen Kitt <steve@sk2.org>
Cc: Xiaoming Ni <nixiaoming@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sysctl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 02134a8abad7..180adf7da785 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -228,7 +228,7 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
 
 void unregister_sysctl_table(struct ctl_table_header * table);
 
-extern int sysctl_init(void);
+extern int sysctl_init_bases(void);
 extern void __register_sysctl_init(const char *path, struct ctl_table *table,
 				 const char *table_name);
 #define register_sysctl_init(path, table) __register_sysctl_init(path, table, #table)
-- 
cgit v1.2.3


From fdcd4073fccc6f989308be3f1d61d8a68cd990ce Mon Sep 17 00:00:00 2001
From: Xiaoming Ni <nixiaoming@huawei.com>
Date: Fri, 21 Jan 2022 22:13:34 -0800
Subject: printk: fix build warning when CONFIG_PRINTK=n

build warning when CONFIG_PRINTK=n

	kernel/printk/printk.c:175:5: warning: no previous prototype for
	 'devkmsg_sysctl_set_loglvl' [-Wmissing-prototypes]

devkmsg_sysctl_set_loglvl() is only used in sysctl.c when
CONFIG_PRINTK=y, but it participates in the build when CONFIG_PRINTK=n.
So add compile dependency CONFIG_PRINTK=y && CONFIG_SYSCTL=y to fix the
build warning.

Link: https://lkml.kernel.org/r/20211129211943.640266-5-mcgrof@kernel.org
Signed-off-by: Xiaoming Ni <nixiaoming@huawei.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: Antti Palosaari <crope@iki.fi>
Cc: Christian Brauner <christian.brauner@ubuntu.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Iurii Zaikin <yzaikin@google.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Lukas Middendorf <kernel@tuxforce.de>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: Stephen Kitt <steve@sk2.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/printk.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/printk.h b/include/linux/printk.h
index 9497f6b98339..1522df223c0f 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -183,10 +183,6 @@ extern bool printk_timed_ratelimit(unsigned long *caller_jiffies,
 extern int printk_delay_msec;
 extern int dmesg_restrict;
 
-extern int
-devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, void *buf,
-			  size_t *lenp, loff_t *ppos);
-
 extern void wake_up_klogd(void);
 
 char *log_buf_addr_get(void);
-- 
cgit v1.2.3


From f0bc21b268c1464603192a00851cdbbf7c2cdc36 Mon Sep 17 00:00:00 2001
From: Xiaoming Ni <nixiaoming@huawei.com>
Date: Fri, 21 Jan 2022 22:13:38 -0800
Subject: fs/coredump: move coredump sysctls into its own file

This moves the fs/coredump.c respective sysctls to its own file.

Link: https://lkml.kernel.org/r/20211129211943.640266-6-mcgrof@kernel.org
Signed-off-by: Xiaoming Ni <nixiaoming@huawei.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: Antti Palosaari <crope@iki.fi>
Cc: Christian Brauner <christian.brauner@ubuntu.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Iurii Zaikin <yzaikin@google.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Lukas Middendorf <kernel@tuxforce.de>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: Stephen Kitt <steve@sk2.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/coredump.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/coredump.h b/include/linux/coredump.h
index 78fcd776b185..248a68c668b4 100644
--- a/include/linux/coredump.h
+++ b/include/linux/coredump.h
@@ -14,10 +14,6 @@ struct core_vma_metadata {
 	unsigned long dump_size;
 };
 
-extern int core_uses_pid;
-extern char core_pattern[];
-extern unsigned int core_pipe_limit;
-
 /*
  * These are the only things you should do on a core-file: use only these
  * functions to write out all the necessary info.
@@ -37,4 +33,10 @@ extern void do_coredump(const kernel_siginfo_t *siginfo);
 static inline void do_coredump(const kernel_siginfo_t *siginfo) {}
 #endif
 
+#if defined(CONFIG_COREDUMP) && defined(CONFIG_SYSCTL)
+extern void validate_coredump_safety(void);
+#else
+static inline void validate_coredump_safety(void) {}
+#endif
+
 #endif /* _LINUX_COREDUMP_H */
-- 
cgit v1.2.3


From a737a3c6744bc822d1e6a837fef550e665ddf877 Mon Sep 17 00:00:00 2001
From: Xiaoming Ni <nixiaoming@huawei.com>
Date: Fri, 21 Jan 2022 22:13:41 -0800
Subject: kprobe: move sysctl_kprobes_optimization to kprobes.c

kernel/sysctl.c is a kitchen sink where everyone leaves their dirty
dishes, this makes it very difficult to maintain.

To help with this maintenance let's start by moving sysctls to places
where they actually belong.  The proc sysctl maintainers do not want to
know what sysctl knobs you wish to add for your own piece of code, we
just care about the core logic.

Move sysctl_kprobes_optimization from kernel/sysctl.c to
kernel/kprobes.c.  Use register_sysctl() to register the sysctl
interface.

[mcgrof@kernel.org: fix compile issue when CONFIG_OPTPROBES is disabled]

Link: https://lkml.kernel.org/r/20211129211943.640266-7-mcgrof@kernel.org
Signed-off-by: Xiaoming Ni <nixiaoming@huawei.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: Antti Palosaari <crope@iki.fi>
Cc: Christian Brauner <christian.brauner@ubuntu.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Iurii Zaikin <yzaikin@google.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Lukas Middendorf <kernel@tuxforce.de>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: Stephen Kitt <steve@sk2.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kprobes.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 8c8f7a4d93af..19b884353b15 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -348,12 +348,6 @@ extern void opt_pre_handler(struct kprobe *p, struct pt_regs *regs);
 
 DEFINE_INSN_CACHE_OPS(optinsn);
 
-#ifdef CONFIG_SYSCTL
-extern int sysctl_kprobes_optimization;
-extern int proc_kprobes_optimization_handler(struct ctl_table *table,
-					     int write, void *buffer,
-					     size_t *length, loff_t *ppos);
-#endif /* CONFIG_SYSCTL */
 extern void wait_for_kprobe_optimizer(void);
 #else /* !CONFIG_OPTPROBES */
 static inline void wait_for_kprobe_optimizer(void) { }
-- 
cgit v1.2.3


From 4a57d6bbaecd28c8175dc5da013009e4158018c2 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Fri, 21 Jan 2022 22:14:10 -0800
Subject: locking/rwlocks: introduce write_lock_nested

In preparation for converting bit_spin_lock to rwlock in zsmalloc so
that multiple writers of zspages can run at the same time but those
zspages are supposed to be different zspage instance.  Thus, it's not
deadlock.  This patch adds write_lock_nested to support the case for
LOCKDEP.

[minchan@kernel.org: fix write_lock_nested for RT]
  Link: https://lkml.kernel.org/r/YZfrMTAXV56HFWJY@google.com
[bigeasy@linutronix.de: fixup write_lock_nested() implementation]
  Link: https://lkml.kernel.org/r/20211123170134.y6xb7pmpgdn4m3bn@linutronix.de

Link: https://lkml.kernel.org/r/20211115185909.3949505-8-minchan@kernel.org
Signed-off-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Mike Galbraith <umgwanakikbuti@gmail.com>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Naresh Kamboju <naresh.kamboju@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/rwlock.h          |  6 ++++++
 include/linux/rwlock_api_smp.h  |  8 ++++++++
 include/linux/rwlock_rt.h       | 10 ++++++++++
 include/linux/spinlock_api_up.h |  1 +
 4 files changed, 25 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h
index 2c0ad417ce3c..8f416c5e929e 100644
--- a/include/linux/rwlock.h
+++ b/include/linux/rwlock.h
@@ -55,6 +55,12 @@ do {								\
 #define write_lock(lock)	_raw_write_lock(lock)
 #define read_lock(lock)		_raw_read_lock(lock)
 
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+#define write_lock_nested(lock, subclass)	_raw_write_lock_nested(lock, subclass)
+#else
+#define write_lock_nested(lock, subclass)	_raw_write_lock(lock)
+#endif
+
 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
 
 #define read_lock_irqsave(lock, flags)			\
diff --git a/include/linux/rwlock_api_smp.h b/include/linux/rwlock_api_smp.h
index f1db6f17c4fb..dceb0a59b692 100644
--- a/include/linux/rwlock_api_smp.h
+++ b/include/linux/rwlock_api_smp.h
@@ -17,6 +17,7 @@
 
 void __lockfunc _raw_read_lock(rwlock_t *lock)		__acquires(lock);
 void __lockfunc _raw_write_lock(rwlock_t *lock)		__acquires(lock);
+void __lockfunc _raw_write_lock_nested(rwlock_t *lock, int subclass)	__acquires(lock);
 void __lockfunc _raw_read_lock_bh(rwlock_t *lock)	__acquires(lock);
 void __lockfunc _raw_write_lock_bh(rwlock_t *lock)	__acquires(lock);
 void __lockfunc _raw_read_lock_irq(rwlock_t *lock)	__acquires(lock);
@@ -209,6 +210,13 @@ static inline void __raw_write_lock(rwlock_t *lock)
 	LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock);
 }
 
+static inline void __raw_write_lock_nested(rwlock_t *lock, int subclass)
+{
+	preempt_disable();
+	rwlock_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+	LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock);
+}
+
 #endif /* !CONFIG_GENERIC_LOCKBREAK || CONFIG_DEBUG_LOCK_ALLOC */
 
 static inline void __raw_write_unlock(rwlock_t *lock)
diff --git a/include/linux/rwlock_rt.h b/include/linux/rwlock_rt.h
index 49c1f3842ed5..8544ff05e594 100644
--- a/include/linux/rwlock_rt.h
+++ b/include/linux/rwlock_rt.h
@@ -28,6 +28,7 @@ extern void rt_read_lock(rwlock_t *rwlock);
 extern int rt_read_trylock(rwlock_t *rwlock);
 extern void rt_read_unlock(rwlock_t *rwlock);
 extern void rt_write_lock(rwlock_t *rwlock);
+extern void rt_write_lock_nested(rwlock_t *rwlock, int subclass);
 extern int rt_write_trylock(rwlock_t *rwlock);
 extern void rt_write_unlock(rwlock_t *rwlock);
 
@@ -83,6 +84,15 @@ static __always_inline void write_lock(rwlock_t *rwlock)
 	rt_write_lock(rwlock);
 }
 
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static __always_inline void write_lock_nested(rwlock_t *rwlock, int subclass)
+{
+	rt_write_lock_nested(rwlock, subclass);
+}
+#else
+#define write_lock_nested(lock, subclass)	rt_write_lock(((void)(subclass), (lock)))
+#endif
+
 static __always_inline void write_lock_bh(rwlock_t *rwlock)
 {
 	local_bh_disable();
diff --git a/include/linux/spinlock_api_up.h b/include/linux/spinlock_api_up.h
index d0d188861ad6..b8ba00ccccde 100644
--- a/include/linux/spinlock_api_up.h
+++ b/include/linux/spinlock_api_up.h
@@ -59,6 +59,7 @@
 #define _raw_spin_lock_nested(lock, subclass)	__LOCK(lock)
 #define _raw_read_lock(lock)			__LOCK(lock)
 #define _raw_write_lock(lock)			__LOCK(lock)
+#define _raw_write_lock_nested(lock, subclass)	__LOCK(lock)
 #define _raw_spin_lock_bh(lock)			__LOCK_BH(lock)
 #define _raw_read_lock_bh(lock)			__LOCK_BH(lock)
 #define _raw_write_lock_bh(lock)		__LOCK_BH(lock)
-- 
cgit v1.2.3


From 6dfbbae14a7b961f41d80a106e1ab60e86d061c5 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Fri, 21 Jan 2022 22:14:20 -0800
Subject: fs: proc: store PDE()->data into inode->i_private

PDE_DATA(inode) is introduced to get user private data and hide the
layout of struct proc_dir_entry.  The inode->i_private is used to do the
same thing as well.  Save a copy of user private data to inode->
i_private when proc inode is allocated.  This means the user also can
get their private data by inode->i_private.

Introduce pde_data() to wrap inode->i_private so that we can remove
PDE_DATA() from fs/proc/generic.c and make PTE_DATE() as a wrapper of
pde_data().  It will be easier if we decide to remove PDE_DATE() in the
future.

Link: https://lkml.kernel.org/r/20211124081956.87711-1-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Christian Brauner <christian.brauner@ubuntu.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Alexey Gladkov <gladkov.alexey@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/proc_fs.h | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 01b9268451a8..b6e7005cc1b2 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -110,7 +110,18 @@ extern struct proc_dir_entry *proc_create_data(const char *, umode_t,
 struct proc_dir_entry *proc_create(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct proc_ops *proc_ops);
 extern void proc_set_size(struct proc_dir_entry *, loff_t);
 extern void proc_set_user(struct proc_dir_entry *, kuid_t, kgid_t);
-extern void *PDE_DATA(const struct inode *);
+
+/*
+ * Obtain the private data passed by user through proc_create_data() or
+ * related.
+ */
+static inline void *pde_data(const struct inode *inode)
+{
+	return inode->i_private;
+}
+
+#define PDE_DATA(i)	pde_data(i)
+
 extern void *proc_get_parent_data(const struct inode *);
 extern void proc_remove(struct proc_dir_entry *);
 extern void remove_proc_entry(const char *, struct proc_dir_entry *);
-- 
cgit v1.2.3


From 359745d78351c6f5442435f81549f0207ece28aa Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Fri, 21 Jan 2022 22:14:23 -0800
Subject: proc: remove PDE_DATA() completely

Remove PDE_DATA() completely and replace it with pde_data().

[akpm@linux-foundation.org: fix naming clash in drivers/nubus/proc.c]
[akpm@linux-foundation.org: now fix it properly]

Link: https://lkml.kernel.org/r/20211124081956.87711-2-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Christian Brauner <christian.brauner@ubuntu.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Alexey Gladkov <gladkov.alexey@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/proc_fs.h  | 4 +---
 include/linux/seq_file.h | 2 +-
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index b6e7005cc1b2..81d6e4ec2294 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -120,8 +120,6 @@ static inline void *pde_data(const struct inode *inode)
 	return inode->i_private;
 }
 
-#define PDE_DATA(i)	pde_data(i)
-
 extern void *proc_get_parent_data(const struct inode *);
 extern void proc_remove(struct proc_dir_entry *);
 extern void remove_proc_entry(const char *, struct proc_dir_entry *);
@@ -202,7 +200,7 @@ proc_create_data(const char *name, umode_t mode, struct proc_dir_entry *parent,
 
 static inline void proc_set_size(struct proc_dir_entry *de, loff_t size) {}
 static inline void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid) {}
-static inline void *PDE_DATA(const struct inode *inode) {BUG(); return NULL;}
+static inline void *pde_data(const struct inode *inode) {BUG(); return NULL;}
 static inline void *proc_get_parent_data(const struct inode *inode) { BUG(); return NULL; }
 
 static inline void proc_remove(struct proc_dir_entry *de) {}
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index 72dbb44a4573..88cc16444b43 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -209,7 +209,7 @@ static const struct file_operations __name ## _fops = {			\
 #define DEFINE_PROC_SHOW_ATTRIBUTE(__name)				\
 static int __name ## _open(struct inode *inode, struct file *file)	\
 {									\
-	return single_open(file, __name ## _show, PDE_DATA(inode));	\
+	return single_open(file, __name ## _show, pde_data(inode));	\
 }									\
 									\
 static const struct proc_ops __name ## _proc_ops = {			\
-- 
cgit v1.2.3


From 2dba5eb1c73b6ba2988ced07250edeac0f8cbf5a Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Fri, 21 Jan 2022 22:14:27 -0800
Subject: lib/stackdepot: allow optional init and stack_table allocation by
 kvmalloc()

Currently, enabling CONFIG_STACKDEPOT means its stack_table will be
allocated from memblock, even if stack depot ends up not actually used.
The default size of stack_table is 4MB on 32-bit, 8MB on 64-bit.

This is fine for use-cases such as KASAN which is also a config option
and has overhead on its own.  But it's an issue for functionality that
has to be actually enabled on boot (page_owner) or depends on hardware
(GPU drivers) and thus the memory might be wasted.  This was raised as
an issue [1] when attempting to add stackdepot support for SLUB's debug
object tracking functionality.  It's common to build kernels with
CONFIG_SLUB_DEBUG and enable slub_debug on boot only when needed, or
create only specific kmem caches with debugging for testing purposes.

It would thus be more efficient if stackdepot's table was allocated only
when actually going to be used.  This patch thus makes the allocation
(and whole stack_depot_init() call) optional:

 - Add a CONFIG_STACKDEPOT_ALWAYS_INIT flag to keep using the current
   well-defined point of allocation as part of mem_init(). Make
   CONFIG_KASAN select this flag.

 - Other users have to call stack_depot_init() as part of their own init
   when it's determined that stack depot will actually be used. This may
   depend on both config and runtime conditions. Convert current users
   which are page_owner and several in the DRM subsystem. Same will be
   done for SLUB later.

 - Because the init might now be called after the boot-time memblock
   allocation has given all memory to the buddy allocator, change
   stack_depot_init() to allocate stack_table with kvmalloc() when
   memblock is no longer available. Also handle allocation failure by
   disabling stackdepot (could have theoretically happened even with
   memblock allocation previously), and don't unnecessarily align the
   memblock allocation to its own size anymore.

[1] https://lore.kernel.org/all/CAMuHMdW=eoVzM1Re5FVoEN87nKfiLmM2+Ah7eNu2KXEhCvbZyA@mail.gmail.com/

Link: https://lkml.kernel.org/r/20211013073005.11351-1-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Dmitry Vyukov <dvyukov@google.com>
Reviewed-by: Marco Elver <elver@google.com> # stackdepot
Cc: Marco Elver <elver@google.com>
Cc: Vijayanand Jitta <vjitta@codeaurora.org>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Maxime Ripard <mripard@kernel.org>
Cc: Thomas Zimmermann <tzimmermann@suse.de>
Cc: David Airlie <airlied@linux.ie>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Oliver Glitta <glittao@gmail.com>
Cc: Imran Khan <imran.f.khan@oracle.com>
From: Colin Ian King <colin.king@canonical.com>
Subject: lib/stackdepot: fix spelling mistake and grammar in pr_err message

There is a spelling mistake of the work allocation so fix this and
re-phrase the message to make it easier to read.

Link: https://lkml.kernel.org/r/20211015104159.11282-1-colin.king@canonical.com
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
From: Vlastimil Babka <vbabka@suse.cz>
Subject: lib/stackdepot: allow optional init and stack_table allocation by kvmalloc() - fixup

On FLATMEM, we call page_ext_init_flatmem_late() just before
kmem_cache_init() which means stack_depot_init() (called by page owner
init) will not recognize properly it should use kvmalloc() and not
memblock_alloc().  memblock_alloc() will also not issue a warning and
return a block memory that can be invalid and cause kernel page fault when
saving stacks, as reported by the kernel test robot [1].

Fix this by moving page_ext_init_flatmem_late() below kmem_cache_init() so
that slab_is_available() is true during stack_depot_init().  SPARSEMEM
doesn't have this issue, as it doesn't do page_ext_init_flatmem_late(),
but a different page_ext_init() even later in the boot process.

Thanks to Mike Rapoport for pointing out the FLATMEM init ordering issue.

While at it, also actually resolve a checkpatch warning in stack_depot_init()
from DRM CI, which was supposed to be in the original patch already.

[1] https://lore.kernel.org/all/20211014085450.GC18719@xsang-OptiPlex-9020/

Link: https://lkml.kernel.org/r/6abd9213-19a9-6d58-cedc-2414386d2d81@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reported-by: kernel test robot <oliver.sang@intel.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
From: Vlastimil Babka <vbabka@suse.cz>
Subject: lib/stackdepot: allow optional init and stack_table allocation by kvmalloc() - fixup3

Due to cd06ab2fd48f ("drm/locking: add backtrace for locking contended
locks without backoff") landing recently to -next adding a new stack depot
user in drivers/gpu/drm/drm_modeset_lock.c we need to add an appropriate
call to stack_depot_init() there as well.

Link: https://lkml.kernel.org/r/2a692365-cfa1-64f2-34e0-8aa5674dce5e@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jani Nikula <jani.nikula@intel.com>
Cc: Naresh Kamboju <naresh.kamboju@linaro.org>
Cc: Marco Elver <elver@google.com>
Cc: Vijayanand Jitta <vjitta@codeaurora.org>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Maxime Ripard <mripard@kernel.org>
Cc: Thomas Zimmermann <tzimmermann@suse.de>
Cc: David Airlie <airlied@linux.ie>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Oliver Glitta <glittao@gmail.com>
Cc: Imran Khan <imran.f.khan@oracle.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
From: Vlastimil Babka <vbabka@suse.cz>
Subject: lib/stackdepot: allow optional init and stack_table allocation by kvmalloc() - fixup4

Due to 4e66934eaadc ("lib: add reference counting tracking
infrastructure") landing recently to net-next adding a new stack depot
user in lib/ref_tracker.c we need to add an appropriate call to
stack_depot_init() there as well.

Link: https://lkml.kernel.org/r/45c1b738-1a2f-5b5f-2f6d-86fab206d01c@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Cc: Jiri Slab <jirislaby@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/ref_tracker.h |  2 ++
 include/linux/stackdepot.h  | 25 ++++++++++++++++---------
 2 files changed, 18 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ref_tracker.h b/include/linux/ref_tracker.h
index c11c9db5825c..60f3453be23e 100644
--- a/include/linux/ref_tracker.h
+++ b/include/linux/ref_tracker.h
@@ -4,6 +4,7 @@
 #include <linux/refcount.h>
 #include <linux/types.h>
 #include <linux/spinlock.h>
+#include <linux/stackdepot.h>
 
 struct ref_tracker;
 
@@ -26,6 +27,7 @@ static inline void ref_tracker_dir_init(struct ref_tracker_dir *dir,
 	spin_lock_init(&dir->lock);
 	dir->quarantine_avail = quarantine_count;
 	refcount_set(&dir->untracked, 1);
+	stack_depot_init();
 }
 
 void ref_tracker_dir_exit(struct ref_tracker_dir *dir);
diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h
index c34b55a6e554..17f992fe6355 100644
--- a/include/linux/stackdepot.h
+++ b/include/linux/stackdepot.h
@@ -19,6 +19,22 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries,
 					unsigned int nr_entries,
 					gfp_t gfp_flags, bool can_alloc);
 
+/*
+ * Every user of stack depot has to call this during its own init when it's
+ * decided that it will be calling stack_depot_save() later.
+ *
+ * The alternative is to select STACKDEPOT_ALWAYS_INIT to have stack depot
+ * enabled as part of mm_init(), for subsystems where it's known at compile time
+ * that stack depot will be used.
+ */
+int stack_depot_init(void);
+
+#ifdef CONFIG_STACKDEPOT_ALWAYS_INIT
+static inline int stack_depot_early_init(void)	{ return stack_depot_init(); }
+#else
+static inline int stack_depot_early_init(void)	{ return 0; }
+#endif
+
 depot_stack_handle_t stack_depot_save(unsigned long *entries,
 				      unsigned int nr_entries, gfp_t gfp_flags);
 
@@ -30,13 +46,4 @@ int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size,
 
 void stack_depot_print(depot_stack_handle_t stack);
 
-#ifdef CONFIG_STACKDEPOT
-int stack_depot_init(void);
-#else
-static inline int stack_depot_init(void)
-{
-	return 0;
-}
-#endif	/* CONFIG_STACKDEPOT */
-
 #endif
-- 
cgit v1.2.3


From 0a4ee518185e902758191d968600399f3bc2be31 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 21 Jan 2022 22:14:34 -0800
Subject: mm: remove cleancache

Patch series "remove Xen tmem leftovers".

Since the removal of the Xen tmem driver in 2019, the cleancache hooks
are entirely unused, as are large parts of frontswap.  This series
against linux-next (with the folio changes included) removes
cleancaches, and cuts down frontswap to the bits actually used by zswap.

This patch (of 13):

The cleancache subsystem is unused since the removal of Xen tmem driver
in commit 814bbf49dcd0 ("xen: remove tmem driver").

[akpm@linux-foundation.org: remove now-unreachable code]

Link: https://lkml.kernel.org/r/20211224062246.1258487-1-hch@lst.de
Link: https://lkml.kernel.org/r/20211224062246.1258487-2-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Juergen Gross <jgross@suse.com>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Konrad Rzeszutek Wilk <Konrad.wilk@oracle.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Vitaly Wool <vitaly.wool@konsulko.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cleancache.h | 124 ---------------------------------------------
 include/linux/fs.h         |   5 --
 2 files changed, 129 deletions(-)
 delete mode 100644 include/linux/cleancache.h

(limited to 'include/linux')

diff --git a/include/linux/cleancache.h b/include/linux/cleancache.h
deleted file mode 100644
index 5f5730c1d324..000000000000
--- a/include/linux/cleancache.h
+++ /dev/null
@@ -1,124 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_CLEANCACHE_H
-#define _LINUX_CLEANCACHE_H
-
-#include <linux/fs.h>
-#include <linux/exportfs.h>
-#include <linux/mm.h>
-
-#define CLEANCACHE_NO_POOL		-1
-#define CLEANCACHE_NO_BACKEND		-2
-#define CLEANCACHE_NO_BACKEND_SHARED	-3
-
-#define CLEANCACHE_KEY_MAX 6
-
-/*
- * cleancache requires every file with a page in cleancache to have a
- * unique key unless/until the file is removed/truncated.  For some
- * filesystems, the inode number is unique, but for "modern" filesystems
- * an exportable filehandle is required (see exportfs.h)
- */
-struct cleancache_filekey {
-	union {
-		ino_t ino;
-		__u32 fh[CLEANCACHE_KEY_MAX];
-		u32 key[CLEANCACHE_KEY_MAX];
-	} u;
-};
-
-struct cleancache_ops {
-	int (*init_fs)(size_t);
-	int (*init_shared_fs)(uuid_t *uuid, size_t);
-	int (*get_page)(int, struct cleancache_filekey,
-			pgoff_t, struct page *);
-	void (*put_page)(int, struct cleancache_filekey,
-			pgoff_t, struct page *);
-	void (*invalidate_page)(int, struct cleancache_filekey, pgoff_t);
-	void (*invalidate_inode)(int, struct cleancache_filekey);
-	void (*invalidate_fs)(int);
-};
-
-extern int cleancache_register_ops(const struct cleancache_ops *ops);
-extern void __cleancache_init_fs(struct super_block *);
-extern void __cleancache_init_shared_fs(struct super_block *);
-extern int  __cleancache_get_page(struct page *);
-extern void __cleancache_put_page(struct page *);
-extern void __cleancache_invalidate_page(struct address_space *, struct page *);
-extern void __cleancache_invalidate_inode(struct address_space *);
-extern void __cleancache_invalidate_fs(struct super_block *);
-
-#ifdef CONFIG_CLEANCACHE
-#define cleancache_enabled (1)
-static inline bool cleancache_fs_enabled_mapping(struct address_space *mapping)
-{
-	return mapping->host->i_sb->cleancache_poolid >= 0;
-}
-static inline bool cleancache_fs_enabled(struct page *page)
-{
-	return cleancache_fs_enabled_mapping(page->mapping);
-}
-#else
-#define cleancache_enabled (0)
-#define cleancache_fs_enabled(_page) (0)
-#define cleancache_fs_enabled_mapping(_page) (0)
-#endif
-
-/*
- * The shim layer provided by these inline functions allows the compiler
- * to reduce all cleancache hooks to nothingness if CONFIG_CLEANCACHE
- * is disabled, to a single global variable check if CONFIG_CLEANCACHE
- * is enabled but no cleancache "backend" has dynamically enabled it,
- * and, for the most frequent cleancache ops, to a single global variable
- * check plus a superblock element comparison if CONFIG_CLEANCACHE is enabled
- * and a cleancache backend has dynamically enabled cleancache, but the
- * filesystem referenced by that cleancache op has not enabled cleancache.
- * As a result, CONFIG_CLEANCACHE can be enabled by default with essentially
- * no measurable performance impact.
- */
-
-static inline void cleancache_init_fs(struct super_block *sb)
-{
-	if (cleancache_enabled)
-		__cleancache_init_fs(sb);
-}
-
-static inline void cleancache_init_shared_fs(struct super_block *sb)
-{
-	if (cleancache_enabled)
-		__cleancache_init_shared_fs(sb);
-}
-
-static inline int cleancache_get_page(struct page *page)
-{
-	if (cleancache_enabled && cleancache_fs_enabled(page))
-		return __cleancache_get_page(page);
-	return -1;
-}
-
-static inline void cleancache_put_page(struct page *page)
-{
-	if (cleancache_enabled && cleancache_fs_enabled(page))
-		__cleancache_put_page(page);
-}
-
-static inline void cleancache_invalidate_page(struct address_space *mapping,
-					struct page *page)
-{
-	/* careful... page->mapping is NULL sometimes when this is called */
-	if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping))
-		__cleancache_invalidate_page(mapping, page);
-}
-
-static inline void cleancache_invalidate_inode(struct address_space *mapping)
-{
-	if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping))
-		__cleancache_invalidate_inode(mapping);
-}
-
-static inline void cleancache_invalidate_fs(struct super_block *sb)
-{
-	if (cleancache_enabled)
-		__cleancache_invalidate_fs(sb);
-}
-
-#endif /* _LINUX_CLEANCACHE_H */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9617dea24978..f3daaea16554 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1535,11 +1535,6 @@ struct super_block {
 
 	const struct dentry_operations *s_d_op; /* default d_op for dentries */
 
-	/*
-	 * Saved pool identifier for cleancache (-1 means none)
-	 */
-	int cleancache_poolid;
-
 	struct shrinker s_shrink;	/* per-sb shrinker handle */
 
 	/* Number of inodes with nlink == 0 but still referenced */
-- 
cgit v1.2.3


From 3d6035f136009f9cae380022754cba31f32570c5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 21 Jan 2022 22:14:38 -0800
Subject: frontswap: remove frontswap_writethrough

frontswap_writethrough is never called, so remove it.

Link: https://lkml.kernel.org/r/20211224062246.1258487-3-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Juergen Gross <jgross@suse.com>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Konrad Rzeszutek Wilk <Konrad.wilk@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Vitaly Wool <vitaly.wool@konsulko.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/frontswap.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h
index b07d88c92bb2..4a03fda41572 100644
--- a/include/linux/frontswap.h
+++ b/include/linux/frontswap.h
@@ -26,7 +26,6 @@ struct frontswap_ops {
 extern void frontswap_register_ops(struct frontswap_ops *ops);
 extern void frontswap_shrink(unsigned long);
 extern unsigned long frontswap_curr_pages(void);
-extern void frontswap_writethrough(bool);
 #define FRONTSWAP_HAS_EXCLUSIVE_GETS
 extern void frontswap_tmem_exclusive_gets(bool);
 
-- 
cgit v1.2.3


From 71024cb4a0bfe7767aec7a128d0a1a13a37b7fcd Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 21 Jan 2022 22:14:41 -0800
Subject: frontswap: remove frontswap_tmem_exclusive_gets

frontswap_tmem_exclusive_gets is never called, so remove it.

Link: https://lkml.kernel.org/r/20211224062246.1258487-4-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Juergen Gross <jgross@suse.com>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Konrad Rzeszutek Wilk <Konrad.wilk@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Vitaly Wool <vitaly.wool@konsulko.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/frontswap.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h
index 4a03fda41572..83a56392cc7f 100644
--- a/include/linux/frontswap.h
+++ b/include/linux/frontswap.h
@@ -26,8 +26,6 @@ struct frontswap_ops {
 extern void frontswap_register_ops(struct frontswap_ops *ops);
 extern void frontswap_shrink(unsigned long);
 extern unsigned long frontswap_curr_pages(void);
-#define FRONTSWAP_HAS_EXCLUSIVE_GETS
-extern void frontswap_tmem_exclusive_gets(bool);
 
 extern bool __frontswap_test(struct swap_info_struct *, pgoff_t);
 extern void __frontswap_init(unsigned type, unsigned long *map);
-- 
cgit v1.2.3


From 0b364446d734da76e421dbfb09e5268270cefaf0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 21 Jan 2022 22:14:44 -0800
Subject: frontswap: remove frontswap_shrink

frontswap_shrink is never called, so remove it.

Link: https://lkml.kernel.org/r/20211224062246.1258487-5-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Juergen Gross <jgross@suse.com>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Konrad Rzeszutek Wilk <Konrad.wilk@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Vitaly Wool <vitaly.wool@konsulko.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/frontswap.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h
index 83a56392cc7f..d268d7bb6513 100644
--- a/include/linux/frontswap.h
+++ b/include/linux/frontswap.h
@@ -24,7 +24,6 @@ struct frontswap_ops {
 };
 
 extern void frontswap_register_ops(struct frontswap_ops *ops);
-extern void frontswap_shrink(unsigned long);
 extern unsigned long frontswap_curr_pages(void);
 
 extern bool __frontswap_test(struct swap_info_struct *, pgoff_t);
-- 
cgit v1.2.3


From 3e8e1af63d7a831f576477c25d9b89049bd2d53d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 21 Jan 2022 22:14:47 -0800
Subject: frontswap: remove frontswap_curr_pages

frontswap_curr_pages is never called, so remove it.

Link: https://lkml.kernel.org/r/20211224062246.1258487-6-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Juergen Gross <jgross@suse.com>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Konrad Rzeszutek Wilk <Konrad.wilk@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Vitaly Wool <vitaly.wool@konsulko.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/frontswap.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h
index d268d7bb6513..5205c2977b20 100644
--- a/include/linux/frontswap.h
+++ b/include/linux/frontswap.h
@@ -24,7 +24,6 @@ struct frontswap_ops {
 };
 
 extern void frontswap_register_ops(struct frontswap_ops *ops);
-extern unsigned long frontswap_curr_pages(void);
 
 extern bool __frontswap_test(struct swap_info_struct *, pgoff_t);
 extern void __frontswap_init(unsigned type, unsigned long *map);
-- 
cgit v1.2.3


From 1cf53c894d15dd4b73397a56fa055d76d3db66b4 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 21 Jan 2022 22:14:51 -0800
Subject: frontswap: simplify frontswap_init

Just use IS_ENABLED() and remove the __frontswap_init indirection.

Also remove the unused export.

Link: https://lkml.kernel.org/r/20211224062246.1258487-7-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Juergen Gross <jgross@suse.com>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Konrad Rzeszutek Wilk <Konrad.wilk@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Vitaly Wool <vitaly.wool@konsulko.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/frontswap.h | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h
index 5205c2977b20..73d7beb44f2b 100644
--- a/include/linux/frontswap.h
+++ b/include/linux/frontswap.h
@@ -26,7 +26,7 @@ struct frontswap_ops {
 extern void frontswap_register_ops(struct frontswap_ops *ops);
 
 extern bool __frontswap_test(struct swap_info_struct *, pgoff_t);
-extern void __frontswap_init(unsigned type, unsigned long *map);
+extern void frontswap_init(unsigned type, unsigned long *map);
 extern int __frontswap_store(struct page *page);
 extern int __frontswap_load(struct page *page);
 extern void __frontswap_invalidate_page(unsigned, pgoff_t);
@@ -107,11 +107,4 @@ static inline void frontswap_invalidate_area(unsigned type)
 		__frontswap_invalidate_area(type);
 }
 
-static inline void frontswap_init(unsigned type, unsigned long *map)
-{
-#ifdef CONFIG_FRONTSWAP
-	__frontswap_init(type, map);
-#endif
-}
-
 #endif /* _LINUX_FRONTSWAP_H */
-- 
cgit v1.2.3


From 10a9c496789fe2098bfc018650fc77b23ba08a54 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 21 Jan 2022 22:14:57 -0800
Subject: mm: simplify try_to_unuse

Remove the unused frontswap and pages_to_unuse arguments, and mark the
function static now that the caller in frontswap is gone.

[akpm@linux-foundation.org: fix shmem_unuse() stub, per Matthew]

Link: https://lkml.kernel.org/r/20211224062246.1258487-9-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Juergen Gross <jgross@suse.com>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Konrad Rzeszutek Wilk <Konrad.wilk@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Vitaly Wool <vitaly.wool@konsulko.com>
Cc: Naresh Kamboju <naresh.kamboju@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/frontswap.h | 7 -------
 include/linux/shmem_fs.h  | 3 +--
 include/linux/swapfile.h  | 1 -
 3 files changed, 1 insertion(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h
index 73d7beb44f2b..a9817d4fa74c 100644
--- a/include/linux/frontswap.h
+++ b/include/linux/frontswap.h
@@ -7,13 +7,6 @@
 #include <linux/bitops.h>
 #include <linux/jump_label.h>
 
-/*
- * Return code to denote that requested number of
- * frontswap pages are unused(moved to page cache).
- * Used in shmem_unuse and try_to_unuse.
- */
-#define FRONTSWAP_PAGES_UNUSED	2
-
 struct frontswap_ops {
 	void (*init)(unsigned); /* this swap type was just swapon'ed */
 	int (*store)(unsigned, pgoff_t, struct page *); /* store a page */
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 166158b6e917..e65b80ed09e7 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -83,8 +83,7 @@ extern void shmem_unlock_mapping(struct address_space *mapping);
 extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
 					pgoff_t index, gfp_t gfp_mask);
 extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
-extern int shmem_unuse(unsigned int type, bool frontswap,
-		       unsigned long *fs_pages_to_unuse);
+int shmem_unuse(unsigned int type);
 
 extern bool shmem_is_huge(struct vm_area_struct *vma,
 			  struct inode *inode, pgoff_t index);
diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
index e06febf62978..809cd01ef2c5 100644
--- a/include/linux/swapfile.h
+++ b/include/linux/swapfile.h
@@ -9,7 +9,6 @@
 extern spinlock_t swap_lock;
 extern struct plist_head swap_active_head;
 extern struct swap_info_struct *swap_info[];
-extern int try_to_unuse(unsigned int, bool, unsigned long);
 extern unsigned long generic_max_swapfile_size(void);
 extern unsigned long max_swapfile_size(void);
 
-- 
cgit v1.2.3


From bd9cd521496ba8d537d8f46f4167bf4221aba9a3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 21 Jan 2022 22:15:01 -0800
Subject: frontswap: remove frontswap_test

frontswap_test is unused now, remove it.

Link: https://lkml.kernel.org/r/20211224062246.1258487-10-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Juergen Gross <jgross@suse.com>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Konrad Rzeszutek Wilk <Konrad.wilk@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Vitaly Wool <vitaly.wool@konsulko.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/frontswap.h | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h
index a9817d4fa74c..c5b2848d2240 100644
--- a/include/linux/frontswap.h
+++ b/include/linux/frontswap.h
@@ -18,7 +18,6 @@ struct frontswap_ops {
 
 extern void frontswap_register_ops(struct frontswap_ops *ops);
 
-extern bool __frontswap_test(struct swap_info_struct *, pgoff_t);
 extern void frontswap_init(unsigned type, unsigned long *map);
 extern int __frontswap_store(struct page *page);
 extern int __frontswap_load(struct page *page);
@@ -33,11 +32,6 @@ static inline bool frontswap_enabled(void)
 	return static_branch_unlikely(&frontswap_enabled_key);
 }
 
-static inline bool frontswap_test(struct swap_info_struct *sis, pgoff_t offset)
-{
-	return __frontswap_test(sis, offset);
-}
-
 static inline void frontswap_map_set(struct swap_info_struct *p,
 				     unsigned long *map)
 {
@@ -56,11 +50,6 @@ static inline bool frontswap_enabled(void)
 	return false;
 }
 
-static inline bool frontswap_test(struct swap_info_struct *sis, pgoff_t offset)
-{
-	return false;
-}
-
 static inline void frontswap_map_set(struct swap_info_struct *p,
 				     unsigned long *map)
 {
-- 
cgit v1.2.3


From 633423a09cb5cfe61438283e1ce49c23cf4a0611 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 21 Jan 2022 22:15:07 -0800
Subject: mm: mark swap_lock and swap_active_head static

swap_lock and swap_active_head are only used in swapfile.c, so mark them
static.

Link: https://lkml.kernel.org/r/20211224062246.1258487-12-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Juergen Gross <jgross@suse.com>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Konrad Rzeszutek Wilk <Konrad.wilk@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Vitaly Wool <vitaly.wool@konsulko.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swapfile.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
index 809cd01ef2c5..54078542134c 100644
--- a/include/linux/swapfile.h
+++ b/include/linux/swapfile.h
@@ -6,8 +6,6 @@
  * these were static in swapfile.c but frontswap.c needs them and we don't
  * want to expose them to the dozens of source files that include swap.h
  */
-extern spinlock_t swap_lock;
-extern struct plist_head swap_active_head;
 extern struct swap_info_struct *swap_info[];
 extern unsigned long generic_max_swapfile_size(void);
 extern unsigned long max_swapfile_size(void);
-- 
cgit v1.2.3


From 1da0d94a3ec8c5f3793b7be8538b55e60ebeefe3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 21 Jan 2022 22:15:10 -0800
Subject: frontswap: remove support for multiple ops

There is only a single instance of frontswap ops in the kernel, so
simplify the frontswap code by removing support for multiple operations.

Link: https://lkml.kernel.org/r/20211224062246.1258487-13-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Juergen Gross <jgross@suse.com>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Konrad Rzeszutek Wilk <Konrad.wilk@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Vitaly Wool <vitaly.wool@konsulko.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/frontswap.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h
index c5b2848d2240..a631bac12220 100644
--- a/include/linux/frontswap.h
+++ b/include/linux/frontswap.h
@@ -13,10 +13,9 @@ struct frontswap_ops {
 	int (*load)(unsigned, pgoff_t, struct page *); /* load a page */
 	void (*invalidate_page)(unsigned, pgoff_t); /* page no longer needed */
 	void (*invalidate_area)(unsigned); /* swap type just swapoff'ed */
-	struct frontswap_ops *next; /* private pointer to next ops */
 };
 
-extern void frontswap_register_ops(struct frontswap_ops *ops);
+int frontswap_register_ops(const struct frontswap_ops *ops);
 
 extern void frontswap_init(unsigned type, unsigned long *map);
 extern int __frontswap_store(struct page *page);
-- 
cgit v1.2.3


From a37d9a17f099072fe4d3a9048b0321978707a918 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Thu, 20 Jan 2022 23:53:04 +0200
Subject: fsnotify: invalidate dcache before IN_DELETE event

Apparently, there are some applications that use IN_DELETE event as an
invalidation mechanism and expect that if they try to open a file with
the name reported with the delete event, that it should not contain the
content of the deleted file.

Commit 49246466a989 ("fsnotify: move fsnotify_nameremove() hook out of
d_delete()") moved the fsnotify delete hook before d_delete() so fsnotify
will have access to a positive dentry.

This allowed a race where opening the deleted file via cached dentry
is now possible after receiving the IN_DELETE event.

To fix the regression, create a new hook fsnotify_delete() that takes
the unlinked inode as an argument and use a helper d_delete_notify() to
pin the inode, so we can pass it to fsnotify_delete() after d_delete().

Backporting hint: this regression is from v5.3. Although patch will
apply with only trivial conflicts to v5.4 and v5.10, it won't build,
because fsnotify_delete() implementation is different in each of those
versions (see fsnotify_link()).

A follow up patch will fix the fsnotify_unlink/rmdir() calls in pseudo
filesystem that do not need to call d_delete().

Link: https://lore.kernel.org/r/20220120215305.282577-1-amir73il@gmail.com
Reported-by: Ivan Delalande <colona@arista.com>
Link: https://lore.kernel.org/linux-fsdevel/YeNyzoDM5hP5LtGW@visor/
Fixes: 49246466a989 ("fsnotify: move fsnotify_nameremove() hook out of d_delete()")
Cc: stable@vger.kernel.org # v5.3+
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/linux/fsnotify.h | 49 ++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 43 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 3a2d7dc3c607..bb8467cd11ae 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -224,6 +224,43 @@ static inline void fsnotify_link(struct inode *dir, struct inode *inode,
 		      dir, &new_dentry->d_name, 0);
 }
 
+/*
+ * fsnotify_delete - @dentry was unlinked and unhashed
+ *
+ * Caller must make sure that dentry->d_name is stable.
+ *
+ * Note: unlike fsnotify_unlink(), we have to pass also the unlinked inode
+ * as this may be called after d_delete() and old_dentry may be negative.
+ */
+static inline void fsnotify_delete(struct inode *dir, struct inode *inode,
+				   struct dentry *dentry)
+{
+	__u32 mask = FS_DELETE;
+
+	if (S_ISDIR(inode->i_mode))
+		mask |= FS_ISDIR;
+
+	fsnotify_name(mask, inode, FSNOTIFY_EVENT_INODE, dir, &dentry->d_name,
+		      0);
+}
+
+/**
+ * d_delete_notify - delete a dentry and call fsnotify_delete()
+ * @dentry: The dentry to delete
+ *
+ * This helper is used to guaranty that the unlinked inode cannot be found
+ * by lookup of this name after fsnotify_delete() event has been delivered.
+ */
+static inline void d_delete_notify(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = d_inode(dentry);
+
+	ihold(inode);
+	d_delete(dentry);
+	fsnotify_delete(dir, inode, dentry);
+	iput(inode);
+}
+
 /*
  * fsnotify_unlink - 'name' was unlinked
  *
@@ -231,10 +268,10 @@ static inline void fsnotify_link(struct inode *dir, struct inode *inode,
  */
 static inline void fsnotify_unlink(struct inode *dir, struct dentry *dentry)
 {
-	/* Expected to be called before d_delete() */
-	WARN_ON_ONCE(d_is_negative(dentry));
+	if (WARN_ON_ONCE(d_is_negative(dentry)))
+		return;
 
-	fsnotify_dirent(dir, dentry, FS_DELETE);
+	fsnotify_delete(dir, d_inode(dentry), dentry);
 }
 
 /*
@@ -258,10 +295,10 @@ static inline void fsnotify_mkdir(struct inode *dir, struct dentry *dentry)
  */
 static inline void fsnotify_rmdir(struct inode *dir, struct dentry *dentry)
 {
-	/* Expected to be called before d_delete() */
-	WARN_ON_ONCE(d_is_negative(dentry));
+	if (WARN_ON_ONCE(d_is_negative(dentry)))
+		return;
 
-	fsnotify_dirent(dir, dentry, FS_DELETE | FS_ISDIR);
+	fsnotify_delete(dir, d_inode(dentry), dentry);
 }
 
 /*
-- 
cgit v1.2.3


From 9daf0a4d32d60a57f2a2533bdf4c178be7fdff7f Mon Sep 17 00:00:00 2001
From: Tom Rix <trix@redhat.com>
Date: Sun, 16 Jan 2022 04:59:36 -0800
Subject: quota: cleanup double word in comment

Remove the second 'handle'.

Link: https://lore.kernel.org/r/20220116125936.389767-1-trix@redhat.com
Signed-off-by: Tom Rix <trix@redhat.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/linux/quota.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/quota.h b/include/linux/quota.h
index 18ebd39c9487..fd692b4a41d5 100644
--- a/include/linux/quota.h
+++ b/include/linux/quota.h
@@ -91,7 +91,7 @@ extern bool qid_valid(struct kqid qid);
  *
  *	When there is no mapping defined for the user-namespace, type,
  *	qid tuple an invalid kqid is returned.  Callers are expected to
- *	test for and handle handle invalid kqids being returned.
+ *	test for and handle invalid kqids being returned.
  *	Invalid kqids may be tested for using qid_valid().
  */
 static inline struct kqid make_kqid(struct user_namespace *from,
-- 
cgit v1.2.3


From 945c37ed564770c78dfe6b9f08bed57a1b4e60ef Mon Sep 17 00:00:00 2001
From: Linyu Yuan <quic_linyyuan@quicinc.com>
Date: Mon, 10 Jan 2022 20:43:28 +0800
Subject: usb: roles: fix include/linux/usb/role.h compile issue

when CONFIG_USB_ROLE_SWITCH is not defined,
add usb_role_switch_find_by_fwnode() definition which return NULL.

Fixes: c6919d5e0cd1 ("usb: roles: Add usb_role_switch_find_by_fwnode()")
Signed-off-by: Linyu Yuan <quic_linyyuan@quicinc.com>
Link: https://lore.kernel.org/r/1641818608-25039-1-git-send-email-quic_linyyuan@quicinc.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/role.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/usb/role.h b/include/linux/usb/role.h
index 031f148ab373..b5deafd91f67 100644
--- a/include/linux/usb/role.h
+++ b/include/linux/usb/role.h
@@ -91,6 +91,12 @@ fwnode_usb_role_switch_get(struct fwnode_handle *node)
 
 static inline void usb_role_switch_put(struct usb_role_switch *sw) { }
 
+static inline struct usb_role_switch *
+usb_role_switch_find_by_fwnode(const struct fwnode_handle *fwnode)
+{
+	return NULL;
+}
+
 static inline struct usb_role_switch *
 usb_role_switch_register(struct device *parent,
 			 const struct usb_role_switch_desc *desc)
-- 
cgit v1.2.3


From 33569ef3c754a82010f266b7b938a66a3ccf90a4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Amadeusz=20S=C5=82awi=C5=84ski?=
 <amadeuszx.slawinski@linux.intel.com>
Date: Wed, 19 Jan 2022 11:47:51 +0100
Subject: PM: hibernate: Remove register_nosave_region_late()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It is an unused wrapper forcing kmalloc allocation for registering
nosave regions. Also, rename __register_nosave_region() to
register_nosave_region() now that there is no need for disambiguation.

Signed-off-by: Amadeusz Sławiński <amadeuszx.slawinski@linux.intel.com>
Reviewed-by: Cezary Rojewski <cezary.rojewski@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/suspend.h | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 5785d909c321..3e8ecdebe601 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -430,15 +430,7 @@ struct platform_hibernation_ops {
 
 #ifdef CONFIG_HIBERNATION
 /* kernel/power/snapshot.c */
-extern void __register_nosave_region(unsigned long b, unsigned long e, int km);
-static inline void __init register_nosave_region(unsigned long b, unsigned long e)
-{
-	__register_nosave_region(b, e, 0);
-}
-static inline void __init register_nosave_region_late(unsigned long b, unsigned long e)
-{
-	__register_nosave_region(b, e, 1);
-}
+extern void register_nosave_region(unsigned long b, unsigned long e);
 extern int swsusp_page_is_forbidden(struct page *);
 extern void swsusp_set_page_free(struct page *);
 extern void swsusp_unset_page_free(struct page *);
@@ -458,7 +450,6 @@ int pfn_is_nosave(unsigned long pfn);
 int hibernate_quiet_exec(int (*func)(void *data), void *data);
 #else /* CONFIG_HIBERNATION */
 static inline void register_nosave_region(unsigned long b, unsigned long e) {}
-static inline void register_nosave_region_late(unsigned long b, unsigned long e) {}
 static inline int swsusp_page_is_forbidden(struct page *p) { return 0; }
 static inline void swsusp_set_page_free(struct page *p) {}
 static inline void swsusp_unset_page_free(struct page *p) {}
-- 
cgit v1.2.3


From ebb7fb1557b1d03b906b668aa2164b51e6b7d19a Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 26 Jan 2022 09:19:20 -0800
Subject: xfs, iomap: limit individual ioend chain lengths in writeback

Trond Myklebust reported soft lockups in XFS IO completion such as
this:

 watchdog: BUG: soft lockup - CPU#12 stuck for 23s! [kworker/12:1:3106]
 CPU: 12 PID: 3106 Comm: kworker/12:1 Not tainted 4.18.0-305.10.2.el8_4.x86_64 #1
 Workqueue: xfs-conv/md127 xfs_end_io [xfs]
 RIP: 0010:_raw_spin_unlock_irqrestore+0x11/0x20
 Call Trace:
  wake_up_page_bit+0x8a/0x110
  iomap_finish_ioend+0xd7/0x1c0
  iomap_finish_ioends+0x7f/0xb0
  xfs_end_ioend+0x6b/0x100 [xfs]
  xfs_end_io+0xb9/0xe0 [xfs]
  process_one_work+0x1a7/0x360
  worker_thread+0x1fa/0x390
  kthread+0x116/0x130
  ret_from_fork+0x35/0x40

Ioends are processed as an atomic completion unit when all the
chained bios in the ioend have completed their IO. Logically
contiguous ioends can also be merged and completed as a single,
larger unit.  Both of these things can be problematic as both the
bio chains per ioend and the size of the merged ioends processed as
a single completion are both unbound.

If we have a large sequential dirty region in the page cache,
write_cache_pages() will keep feeding us sequential pages and we
will keep mapping them into ioends and bios until we get a dirty
page at a non-sequential file offset. These large sequential runs
can will result in bio and ioend chaining to optimise the io
patterns. The pages iunder writeback are pinned within these chains
until the submission chaining is broken, allowing the entire chain
to be completed. This can result in huge chains being processed
in IO completion context.

We get deep bio chaining if we have large contiguous physical
extents. We will keep adding pages to the current bio until it is
full, then we'll chain a new bio to keep adding pages for writeback.
Hence we can build bio chains that map millions of pages and tens of
gigabytes of RAM if the page cache contains big enough contiguous
dirty file regions. This long bio chain pins those pages until the
final bio in the chain completes and the ioend can iterate all the
chained bios and complete them.

OTOH, if we have a physically fragmented file, we end up submitting
one ioend per physical fragment that each have a small bio or bio
chain attached to them. We do not chain these at IO submission time,
but instead we chain them at completion time based on file
offset via iomap_ioend_try_merge(). Hence we can end up with unbound
ioend chains being built via completion merging.

XFS can then do COW remapping or unwritten extent conversion on that
merged chain, which involves walking an extent fragment at a time
and running a transaction to modify the physical extent information.
IOWs, we merge all the discontiguous ioends together into a
contiguous file range, only to then process them individually as
discontiguous extents.

This extent manipulation is computationally expensive and can run in
a tight loop, so merging logically contiguous but physically
discontigous ioends gains us nothing except for hiding the fact the
fact we broke the ioends up into individual physical extents at
submission and then need to loop over those individual physical
extents at completion.

Hence we need to have mechanisms to limit ioend sizes and
to break up completion processing of large merged ioend chains:

1. bio chains per ioend need to be bound in length. Pure overwrites
go straight to iomap_finish_ioend() in softirq context with the
exact bio chain attached to the ioend by submission. Hence the only
way to prevent long holdoffs here is to bound ioend submission
sizes because we can't reschedule in softirq context.

2. iomap_finish_ioends() has to handle unbound merged ioend chains
correctly. This relies on any one call to iomap_finish_ioend() being
bound in runtime so that cond_resched() can be issued regularly as
the long ioend chain is processed. i.e. this relies on mechanism #1
to limit individual ioend sizes to work correctly.

3. filesystems have to loop over the merged ioends to process
physical extent manipulations. This means they can loop internally,
and so we break merging at physical extent boundaries so the
filesystem can easily insert reschedule points between individual
extent manipulations.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reported-and-tested-by: Trond Myklebust <trondmy@hammerspace.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 include/linux/iomap.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index b55bd49e55f5..97a3a2edb585 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -263,9 +263,11 @@ struct iomap_ioend {
 	struct list_head	io_list;	/* next ioend in chain */
 	u16			io_type;
 	u16			io_flags;	/* IOMAP_F_* */
+	u32			io_folios;	/* folios added to ioend */
 	struct inode		*io_inode;	/* file being written to */
 	size_t			io_size;	/* size of the extent */
 	loff_t			io_offset;	/* offset in the file */
+	sector_t		io_sector;	/* start sector of ioend */
 	struct bio		*io_bio;	/* bio being built */
 	struct bio		io_inline_bio;	/* MUST BE LAST! */
 };
-- 
cgit v1.2.3


From d7e4f8545b497b3f5687e592f1c355cbaee64c8c Mon Sep 17 00:00:00 2001
From: Leo Yan <leo.yan@linaro.org>
Date: Wed, 26 Jan 2022 13:04:26 +0800
Subject: pid: Introduce helper task_is_in_init_pid_ns()

Currently the kernel uses open code in multiple places to check if a
task is in the root PID namespace with the kind of format:

  if (task_active_pid_ns(current) == &init_pid_ns)
      do_something();

This patch creates a new helper function, task_is_in_init_pid_ns(), it
returns true if a passed task is in the root PID namespace, otherwise
returns false.  So it will be used to replace open codes.

Suggested-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Leo Yan <leo.yan@linaro.org>
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Acked-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Acked-by: Balbir Singh <bsingharora@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/pid_namespace.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 7c7e627503d2..07481bb87d4e 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -86,4 +86,9 @@ extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
 void pidhash_init(void);
 void pid_idr_init(void);
 
+static inline bool task_is_in_init_pid_ns(struct task_struct *tsk)
+{
+	return task_active_pid_ns(tsk) == &init_pid_ns;
+}
+
 #endif /* _LINUX_PID_NS_H */
-- 
cgit v1.2.3


From 364df53c081d93fcfd6b91085ff2650c7f17b3c7 Mon Sep 17 00:00:00 2001
From: Menglong Dong <imagedong@tencent.com>
Date: Thu, 27 Jan 2022 17:13:01 +0800
Subject: net: socket: rename SKB_DROP_REASON_SOCKET_FILTER

Rename SKB_DROP_REASON_SOCKET_FILTER, which is used
as the reason of skb drop out of socket filter before
it's part of a released kernel. It will be used for
more protocols than just TCP in future series.

Signed-off-by: Menglong Dong <imagedong@tencent.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/all/20220127091308.91401-2-imagedong@tencent.com/
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index bf11e1fbd69b..8a636e678902 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -318,7 +318,7 @@ enum skb_drop_reason {
 	SKB_DROP_REASON_NO_SOCKET,
 	SKB_DROP_REASON_PKT_TOO_SMALL,
 	SKB_DROP_REASON_TCP_CSUM,
-	SKB_DROP_REASON_TCP_FILTER,
+	SKB_DROP_REASON_SOCKET_FILTER,
 	SKB_DROP_REASON_UDP_CSUM,
 	SKB_DROP_REASON_MAX,
 };
-- 
cgit v1.2.3


From 7f5056b9e7b71149bf11073f00a57fa1ac2921a9 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Wed, 26 Jan 2022 15:35:14 -0500
Subject: security, lsm: dentry_init_security() Handle multi LSM registration

A ceph user has reported that ceph is crashing with kernel NULL pointer
dereference. Following is the backtrace.

/proc/version: Linux version 5.16.2-arch1-1 (linux@archlinux) (gcc (GCC)
11.1.0, GNU ld (GNU Binutils) 2.36.1) #1 SMP PREEMPT Thu, 20 Jan 2022
16:18:29 +0000
distro / arch: Arch Linux / x86_64
SELinux is not enabled
ceph cluster version: 16.2.7 (dd0603118f56ab514f133c8d2e3adfc983942503)

relevant dmesg output:
[   30.947129] BUG: kernel NULL pointer dereference, address:
0000000000000000
[   30.947206] #PF: supervisor read access in kernel mode
[   30.947258] #PF: error_code(0x0000) - not-present page
[   30.947310] PGD 0 P4D 0
[   30.947342] Oops: 0000 [#1] PREEMPT SMP PTI
[   30.947388] CPU: 5 PID: 778 Comm: touch Not tainted 5.16.2-arch1-1 #1
86fbf2c313cc37a553d65deb81d98e9dcc2a3659
[   30.947486] Hardware name: Gigabyte Technology Co., Ltd. B365M
DS3H/B365M DS3H, BIOS F5 08/13/2019
[   30.947569] RIP: 0010:strlen+0x0/0x20
[   30.947616] Code: b6 07 38 d0 74 16 48 83 c7 01 84 c0 74 05 48 39 f7 75
ec 31 c0 31 d2 89 d6 89 d7 c3 48 89 f8 31 d2 89 d6 89 d7 c3 0
f 1f 40 00 <80> 3f 00 74 12 48 89 f8 48 83 c0 01 80 38 00 75 f7 48 29 f8 31
ff
[   30.947782] RSP: 0018:ffffa4ed80ffbbb8 EFLAGS: 00010246
[   30.947836] RAX: 0000000000000000 RBX: ffffa4ed80ffbc60 RCX:
0000000000000000
[   30.947904] RDX: 0000000000000000 RSI: 0000000000000000 RDI:
0000000000000000
[   30.947971] RBP: ffff94b0d15c0ae0 R08: 0000000000000000 R09:
0000000000000000
[   30.948040] R10: 0000000000000000 R11: 0000000000000000 R12:
0000000000000000
[   30.948106] R13: 0000000000000001 R14: ffffa4ed80ffbc60 R15:
0000000000000000
[   30.948174] FS:  00007fc7520f0740(0000) GS:ffff94b7ced40000(0000)
knlGS:0000000000000000
[   30.948252] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   30.948308] CR2: 0000000000000000 CR3: 0000000104a40001 CR4:
00000000003706e0
[   30.948376] Call Trace:
[   30.948404]  <TASK>
[   30.948431]  ceph_security_init_secctx+0x7b/0x240 [ceph
49f9c4b9bf5be8760f19f1747e26da33920bce4b]
[   30.948582]  ceph_atomic_open+0x51e/0x8a0 [ceph
49f9c4b9bf5be8760f19f1747e26da33920bce4b]
[   30.948708]  ? get_cached_acl+0x4d/0xa0
[   30.948759]  path_openat+0x60d/0x1030
[   30.948809]  do_filp_open+0xa5/0x150
[   30.948859]  do_sys_openat2+0xc4/0x190
[   30.948904]  __x64_sys_openat+0x53/0xa0
[   30.948948]  do_syscall_64+0x5c/0x90
[   30.948989]  ? exc_page_fault+0x72/0x180
[   30.949034]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[   30.949091] RIP: 0033:0x7fc7521e25bb
[   30.950849] Code: 25 00 00 41 00 3d 00 00 41 00 74 4b 64 8b 04 25 18 00
00 00 85 c0 75 67 44 89 e2 48 89 ee bf 9c ff ff ff b8 01 01 0
0 00 0f 05 <48> 3d 00 f0 ff ff 0f 87 91 00 00 00 48 8b 54 24 28 64 48 2b 14
25

Core of the problem is that ceph checks for return code from
security_dentry_init_security() and if return code is 0, it assumes
everything is fine and continues to call strlen(name), which crashes.

Typically SELinux LSM returns 0 and sets name to "security.selinux" and
it is not a problem. Or if selinux is not compiled in or disabled, it
returns -EOPNOTSUP and ceph deals with it.

But somehow in this configuration, 0 is being returned and "name" is
not being initialized and that's creating the problem.

Our suspicion is that BPF LSM is registering a hook for
dentry_init_security() and returns hook default of 0.

LSM_HOOK(int, 0, dentry_init_security, struct dentry *dentry,...)

I have not been able to reproduce it just by doing CONFIG_BPF_LSM=y.
Stephen has tested the patch though and confirms it solves the problem
for him.

dentry_init_security() is written in such a way that it expects only one
LSM to register the hook. Atleast that's the expectation with current code.

If another LSM returns a hook and returns default, it will simply return
0 as of now and that will break ceph.

Hence, suggestion is that change semantics of this hook a bit. If there
are no LSMs or no LSM is taking ownership and initializing security context,
then return -EOPNOTSUP. Also allow at max one LSM to initialize security
context. This hook can't deal with multiple LSMs trying to init security
context. This patch implements this new behavior.

Reported-by: Stephen Muth <smuth4@gmail.com>
Tested-by: Stephen Muth <smuth4@gmail.com>
Suggested-by: Casey Schaufler <casey@schaufler-ca.com>
Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Reviewed-by: Serge Hallyn <serge@hallyn.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: <stable@vger.kernel.org> # 5.16.0
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Paul Moore <paul@paul-moore.com>
Acked-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: James Morris <jmorris@namei.org>
---
 include/linux/lsm_hook_defs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index df8de62f4710..f0c7b352340a 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -82,7 +82,7 @@ LSM_HOOK(int, 0, sb_add_mnt_opt, const char *option, const char *val,
 	 int len, void **mnt_opts)
 LSM_HOOK(int, 0, move_mount, const struct path *from_path,
 	 const struct path *to_path)
-LSM_HOOK(int, 0, dentry_init_security, struct dentry *dentry,
+LSM_HOOK(int, -EOPNOTSUPP, dentry_init_security, struct dentry *dentry,
 	 int mode, const struct qstr *name, const char **xattr_name,
 	 void **ctx, u32 *ctxlen)
 LSM_HOOK(int, 0, dentry_create_files_as, struct dentry *dentry, int mode,
-- 
cgit v1.2.3


From e45c47d1f94e0cc7b6b079fdb4bcce2995e2adc4 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Fri, 28 Jan 2022 10:58:39 -0500
Subject: block: add bio_start_io_acct_time() to control start_time

bio_start_io_acct_time() interface is like bio_start_io_acct() that
allows start_time to be passed in. This gives drivers the ability to
defer starting accounting until after IO is issued (but possibily not
entirely due to bio splitting).

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Link: https://lore.kernel.org/r/20220128155841.39644-2-snitzer@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9c95df26fc26..f35aea98bc35 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1258,6 +1258,7 @@ unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors,
 void disk_end_io_acct(struct gendisk *disk, unsigned int op,
 		unsigned long start_time);
 
+void bio_start_io_acct_time(struct bio *bio, unsigned long start_time);
 unsigned long bio_start_io_acct(struct bio *bio);
 void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time,
 		struct block_device *orig_bdev);
-- 
cgit v1.2.3


From 6cb917411e028dcb66ce8f5db1b47361b78d7d3f Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Sat, 29 Jan 2022 13:40:52 -0800
Subject: include/linux/sysctl.h: fix register_sysctl_mount_point() return type

The CONFIG_SYSCTL=n stub returns the wrong type.

Fixes: ee9efac48a082 ("sysctl: add helper to register a sysctl mount point")
Reported-by: kernel test robot <lkp@intel.com>
Acked-by: Luis Chamberlain <mcgrof@kernel.org>
Cc: Tong Zhang <ztong0001@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sysctl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 180adf7da785..6353d6db69b2 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -265,7 +265,7 @@ static inline struct ctl_table_header *register_sysctl_table(struct ctl_table *
 	return NULL;
 }
 
-static inline struct sysctl_header *register_sysctl_mount_point(const char *path)
+static inline struct ctl_table_header *register_sysctl_mount_point(const char *path)
 {
 	return NULL;
 }
-- 
cgit v1.2.3


From 536f4217ced62b671bd759f6b549621a5654a70f Mon Sep 17 00:00:00 2001
From: Wei Yang <richard.weiyang@gmail.com>
Date: Sat, 29 Jan 2022 13:41:04 -0800
Subject: mm: page->mapping folio->mapping should have the same offset

As with the other members of folio, the offset of page->mapping and
folio->mapping must be the same.  The compile-time check was
inadvertently removed during development.  Add it back.

[willy@infradead.org: changelog redo]

Link: https://lkml.kernel.org/r/20220104011734.21714-1-richard.weiyang@gmail.com
Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_types.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 9db36dc5d4cf..5140e5feb486 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -261,6 +261,7 @@ static_assert(sizeof(struct page) == sizeof(struct folio));
 	static_assert(offsetof(struct page, pg) == offsetof(struct folio, fl))
 FOLIO_MATCH(flags, flags);
 FOLIO_MATCH(lru, lru);
+FOLIO_MATCH(mapping, mapping);
 FOLIO_MATCH(compound_head, lru);
 FOLIO_MATCH(index, index);
 FOLIO_MATCH(private, private);
-- 
cgit v1.2.3


From 27fe73394a1c6d0b07fa4d95f1bca116d1cc66e9 Mon Sep 17 00:00:00 2001
From: Peter Collingbourne <pcc@google.com>
Date: Sat, 29 Jan 2022 13:41:14 -0800
Subject: mm, kasan: use compare-exchange operation to set KASAN page tag

It has been reported that the tag setting operation on newly-allocated
pages can cause the page flags to be corrupted when performed
concurrently with other flag updates as a result of the use of
non-atomic operations.

Fix the problem by using a compare-exchange loop to update the tag.

Link: https://lkml.kernel.org/r/20220120020148.1632253-1-pcc@google.com
Link: https://linux-review.googlesource.com/id/I456b24a2b9067d93968d43b4bb3351c0cec63101
Fixes: 2813b9c02962 ("kasan, mm, arm64: tag non slab memory allocated via pagealloc")
Signed-off-by: Peter Collingbourne <pcc@google.com>
Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index e1a84b1e6787..213cc569b192 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1506,11 +1506,18 @@ static inline u8 page_kasan_tag(const struct page *page)
 
 static inline void page_kasan_tag_set(struct page *page, u8 tag)
 {
-	if (kasan_enabled()) {
-		tag ^= 0xff;
-		page->flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT);
-		page->flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT;
-	}
+	unsigned long old_flags, flags;
+
+	if (!kasan_enabled())
+		return;
+
+	tag ^= 0xff;
+	old_flags = READ_ONCE(page->flags);
+	do {
+		flags = old_flags;
+		flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT);
+		flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT;
+	} while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags)));
 }
 
 static inline void page_kasan_tag_reset(struct page *page)
-- 
cgit v1.2.3


From 51e50fbd3efc6064c30ed73a5e009018b36e290a Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Sat, 29 Jan 2022 13:41:17 -0800
Subject: psi: fix "no previous prototype" warnings when CONFIG_CGROUPS=n

When CONFIG_CGROUPS is disabled psi code generates the following
warnings:

  kernel/sched/psi.c:1112:21: warning: no previous prototype for 'psi_trigger_create' [-Wmissing-prototypes]
      1112 | struct psi_trigger *psi_trigger_create(struct psi_group *group,
           |                     ^~~~~~~~~~~~~~~~~~
  kernel/sched/psi.c:1182:6: warning: no previous prototype for 'psi_trigger_destroy' [-Wmissing-prototypes]
      1182 | void psi_trigger_destroy(struct psi_trigger *t)
           |      ^~~~~~~~~~~~~~~~~~~
  kernel/sched/psi.c:1249:10: warning: no previous prototype for 'psi_trigger_poll' [-Wmissing-prototypes]
      1249 | __poll_t psi_trigger_poll(void **trigger_ptr,
           |          ^~~~~~~~~~~~~~~~

Change the declarations of these functions in the header to provide the
prototypes even when they are unused.

Link: https://lkml.kernel.org/r/20220119223940.787748-2-surenb@google.com
Fixes: 0e94682b73bf ("psi: introduce psi monitor")
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reported-by: kernel test robot <lkp@intel.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/psi.h | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/psi.h b/include/linux/psi.h
index f8ce53bfdb2a..7f7d1d88c3bb 100644
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -25,18 +25,17 @@ void psi_memstall_enter(unsigned long *flags);
 void psi_memstall_leave(unsigned long *flags);
 
 int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);
-
-#ifdef CONFIG_CGROUPS
-int psi_cgroup_alloc(struct cgroup *cgrp);
-void psi_cgroup_free(struct cgroup *cgrp);
-void cgroup_move_task(struct task_struct *p, struct css_set *to);
-
 struct psi_trigger *psi_trigger_create(struct psi_group *group,
 			char *buf, size_t nbytes, enum psi_res res);
 void psi_trigger_destroy(struct psi_trigger *t);
 
 __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
 			poll_table *wait);
+
+#ifdef CONFIG_CGROUPS
+int psi_cgroup_alloc(struct cgroup *cgrp);
+void psi_cgroup_free(struct cgroup *cgrp);
+void cgroup_move_task(struct task_struct *p, struct css_set *to);
 #endif
 
 #else /* CONFIG_PSI */
-- 
cgit v1.2.3


From ef9989afda73332df566852d6e9ca695c05f10ce Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Tue, 1 Feb 2022 13:29:22 +0000
Subject: kvm: add guest_state_{enter,exit}_irqoff()

When transitioning to/from guest mode, it is necessary to inform
lockdep, tracing, and RCU in a specific order, similar to the
requirements for transitions to/from user mode. Additionally, it is
necessary to perform vtime accounting for a window around running the
guest, with RCU enabled, such that timer interrupts taken from the guest
can be accounted as guest time.

Most architectures don't handle all the necessary pieces, and a have a
number of common bugs, including unsafe usage of RCU during the window
between guest_enter() and guest_exit().

On x86, this was dealt with across commits:

  87fa7f3e98a1310e ("x86/kvm: Move context tracking where it belongs")
  0642391e2139a2c1 ("x86/kvm/vmx: Add hardirq tracing to guest enter/exit")
  9fc975e9efd03e57 ("x86/kvm/svm: Add hardirq tracing on guest enter/exit")
  3ebccdf373c21d86 ("x86/kvm/vmx: Move guest enter/exit into .noinstr.text")
  135961e0a7d555fc ("x86/kvm/svm: Move guest enter/exit into .noinstr.text")
  160457140187c5fb ("KVM: x86: Defer vtime accounting 'til after IRQ handling")
  bc908e091b326467 ("KVM: x86: Consolidate guest enter/exit logic to common helpers")

... but those fixes are specific to x86, and as the resulting logic
(while correct) is split across generic helper functions and
x86-specific helper functions, it is difficult to see that the
entry/exit accounting is balanced.

This patch adds generic helpers which architectures can use to handle
guest entry/exit consistently and correctly. The guest_{enter,exit}()
helpers are split into guest_timing_{enter,exit}() to perform vtime
accounting, and guest_context_{enter,exit}() to perform the necessary
context tracking and RCU management. The existing guest_{enter,exit}()
heleprs are left as wrappers of these.

Atop this, new guest_state_enter_irqoff() and guest_state_exit_irqoff()
helpers are added to handle the ordering of lockdep, tracing, and RCU
manageent. These are inteneded to mirror exit_to_user_mode() and
enter_from_user_mode().

Subsequent patches will migrate architectures over to the new helpers,
following a sequence:

	guest_timing_enter_irqoff();

	guest_state_enter_irqoff();
	< run the vcpu >
	guest_state_exit_irqoff();

	< take any pending IRQs >

	guest_timing_exit_irqoff();

This sequences handles all of the above correctly, and more clearly
balances the entry and exit portions, making it easier to understand.

The existing helpers are marked as deprecated, and will be removed once
all architectures have been converted.

There should be no functional change as a result of this patch.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Message-Id: <20220201132926.3301912-2-mark.rutland@arm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 112 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 109 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index f079820f52b5..b3810976a27f 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -29,7 +29,9 @@
 #include <linux/refcount.h>
 #include <linux/nospec.h>
 #include <linux/notifier.h>
+#include <linux/ftrace.h>
 #include <linux/hashtable.h>
+#include <linux/instrumentation.h>
 #include <linux/interval_tree.h>
 #include <linux/rbtree.h>
 #include <linux/xarray.h>
@@ -368,8 +370,11 @@ struct kvm_vcpu {
 	u64 last_used_slot_gen;
 };
 
-/* must be called with irqs disabled */
-static __always_inline void guest_enter_irqoff(void)
+/*
+ * Start accounting time towards a guest.
+ * Must be called before entering guest context.
+ */
+static __always_inline void guest_timing_enter_irqoff(void)
 {
 	/*
 	 * This is running in ioctl context so its safe to assume that it's the
@@ -378,7 +383,18 @@ static __always_inline void guest_enter_irqoff(void)
 	instrumentation_begin();
 	vtime_account_guest_enter();
 	instrumentation_end();
+}
 
+/*
+ * Enter guest context and enter an RCU extended quiescent state.
+ *
+ * Between guest_context_enter_irqoff() and guest_context_exit_irqoff() it is
+ * unsafe to use any code which may directly or indirectly use RCU, tracing
+ * (including IRQ flag tracing), or lockdep. All code in this period must be
+ * non-instrumentable.
+ */
+static __always_inline void guest_context_enter_irqoff(void)
+{
 	/*
 	 * KVM does not hold any references to rcu protected data when it
 	 * switches CPU into a guest mode. In fact switching to a guest mode
@@ -394,16 +410,79 @@ static __always_inline void guest_enter_irqoff(void)
 	}
 }
 
-static __always_inline void guest_exit_irqoff(void)
+/*
+ * Deprecated. Architectures should move to guest_timing_enter_irqoff() and
+ * guest_state_enter_irqoff().
+ */
+static __always_inline void guest_enter_irqoff(void)
+{
+	guest_timing_enter_irqoff();
+	guest_context_enter_irqoff();
+}
+
+/**
+ * guest_state_enter_irqoff - Fixup state when entering a guest
+ *
+ * Entry to a guest will enable interrupts, but the kernel state is interrupts
+ * disabled when this is invoked. Also tell RCU about it.
+ *
+ * 1) Trace interrupts on state
+ * 2) Invoke context tracking if enabled to adjust RCU state
+ * 3) Tell lockdep that interrupts are enabled
+ *
+ * Invoked from architecture specific code before entering a guest.
+ * Must be called with interrupts disabled and the caller must be
+ * non-instrumentable.
+ * The caller has to invoke guest_timing_enter_irqoff() before this.
+ *
+ * Note: this is analogous to exit_to_user_mode().
+ */
+static __always_inline void guest_state_enter_irqoff(void)
+{
+	instrumentation_begin();
+	trace_hardirqs_on_prepare();
+	lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+	instrumentation_end();
+
+	guest_context_enter_irqoff();
+	lockdep_hardirqs_on(CALLER_ADDR0);
+}
+
+/*
+ * Exit guest context and exit an RCU extended quiescent state.
+ *
+ * Between guest_context_enter_irqoff() and guest_context_exit_irqoff() it is
+ * unsafe to use any code which may directly or indirectly use RCU, tracing
+ * (including IRQ flag tracing), or lockdep. All code in this period must be
+ * non-instrumentable.
+ */
+static __always_inline void guest_context_exit_irqoff(void)
 {
 	context_tracking_guest_exit();
+}
 
+/*
+ * Stop accounting time towards a guest.
+ * Must be called after exiting guest context.
+ */
+static __always_inline void guest_timing_exit_irqoff(void)
+{
 	instrumentation_begin();
 	/* Flush the guest cputime we spent on the guest */
 	vtime_account_guest_exit();
 	instrumentation_end();
 }
 
+/*
+ * Deprecated. Architectures should move to guest_state_exit_irqoff() and
+ * guest_timing_exit_irqoff().
+ */
+static __always_inline void guest_exit_irqoff(void)
+{
+	guest_context_exit_irqoff();
+	guest_timing_exit_irqoff();
+}
+
 static inline void guest_exit(void)
 {
 	unsigned long flags;
@@ -413,6 +492,33 @@ static inline void guest_exit(void)
 	local_irq_restore(flags);
 }
 
+/**
+ * guest_state_exit_irqoff - Establish state when returning from guest mode
+ *
+ * Entry from a guest disables interrupts, but guest mode is traced as
+ * interrupts enabled. Also with NO_HZ_FULL RCU might be idle.
+ *
+ * 1) Tell lockdep that interrupts are disabled
+ * 2) Invoke context tracking if enabled to reactivate RCU
+ * 3) Trace interrupts off state
+ *
+ * Invoked from architecture specific code after exiting a guest.
+ * Must be invoked with interrupts disabled and the caller must be
+ * non-instrumentable.
+ * The caller has to invoke guest_timing_exit_irqoff() after this.
+ *
+ * Note: this is analogous to enter_from_user_mode().
+ */
+static __always_inline void guest_state_exit_irqoff(void)
+{
+	lockdep_hardirqs_off(CALLER_ADDR0);
+	guest_context_exit_irqoff();
+
+	instrumentation_begin();
+	trace_hardirqs_off_finish();
+	instrumentation_end();
+}
+
 static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
 {
 	/*
-- 
cgit v1.2.3


From bee9f65523218e3baeeecde9295c8fbe9bc08e0a Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 27 Jan 2022 16:02:50 +0000
Subject: netfs, cachefiles: Add a method to query presence of data in the
 cache

Add a netfs_cache_ops method by which a network filesystem can ask the
cache about what data it has available and where so that it can make a
multipage read more efficient.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: linux-cachefs@redhat.com
Acked-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Rohith Surabattula <rohiths@microsoft.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
---
 include/linux/netfs.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index b46c39d98bbd..614f22213e21 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -244,6 +244,13 @@ struct netfs_cache_ops {
 	int (*prepare_write)(struct netfs_cache_resources *cres,
 			     loff_t *_start, size_t *_len, loff_t i_size,
 			     bool no_space_allocated_yet);
+
+	/* Query the occupancy of the cache in a region, returning where the
+	 * next chunk of data starts and how long it is.
+	 */
+	int (*query_occupancy)(struct netfs_cache_resources *cres,
+			       loff_t start, size_t len, size_t granularity,
+			       loff_t *_data_start, size_t *_data_len);
 };
 
 struct readahead_control;
-- 
cgit v1.2.3


From 6d5c900eb64107001e91e1f46bddc254dded8a59 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 24 Jan 2022 09:22:41 -0800
Subject: net/mlx5e: Use struct_group() for memcpy() region

In preparation for FORTIFY_SOURCE performing compile-time and run-time
field bounds checking for memcpy(), memmove(), and memset(), avoid
intentionally writing across neighboring fields.

Use struct_group() in struct vlan_ethhdr around members h_dest and
h_source, so they can be referenced together. This will allow memcpy()
and sizeof() to more easily reason about sizes, improve readability,
and avoid future warnings about writing beyond the end of h_dest.

"pahole" shows no size nor member offset changes to struct vlan_ethhdr.
"objdump -d" shows no object code changes.

Fixes: 34802a42b352 ("net/mlx5e: Do not modify the TX SKB")
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/if_vlan.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 8420fe504927..2be4dd7e90a9 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -46,8 +46,10 @@ struct vlan_hdr {
  *	@h_vlan_encapsulated_proto: packet type ID or len
  */
 struct vlan_ethhdr {
-	unsigned char	h_dest[ETH_ALEN];
-	unsigned char	h_source[ETH_ALEN];
+	struct_group(addrs,
+		unsigned char	h_dest[ETH_ALEN];
+		unsigned char	h_source[ETH_ALEN];
+	);
 	__be16		h_vlan_proto;
 	__be16		h_vlan_TCI;
 	__be16		h_vlan_encapsulated_proto;
-- 
cgit v1.2.3


From 1148836fd3226c20de841084aba24184d4fbbe77 Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Wed, 2 Feb 2022 14:55:29 +0100
Subject: Revert "fbdev: Garbage collect fbdev scrolling acceleration, part 1
 (from TODO list)"

This reverts commit b3ec8cdf457e5e63d396fe1346cc788cf7c1b578.

Revert the second (of 2) commits which disabled scrolling acceleration
in fbcon/fbdev.  It introduced a regression for fbdev-supported graphic
cards because of the performance penalty by doing screen scrolling by
software instead of using the existing graphic card 2D hardware
acceleration.

Console scrolling acceleration was disabled by dropping code which
checked at runtime the driver hardware capabilities for the
BINFO_HWACCEL_COPYAREA or FBINFO_HWACCEL_FILLRECT flags and if set, it
enabled scrollmode SCROLL_MOVE which uses hardware acceleration to move
screen contents.  After dropping those checks scrollmode was hard-wired
to SCROLL_REDRAW instead, which forces all graphic cards to redraw every
character at the new screen position when scrolling.

This change effectively disabled all hardware-based scrolling acceleration for
ALL drivers, because now all kind of 2D hardware acceleration (bitblt,
fillrect) in the drivers isn't used any longer.

The original commit message mentions that only 3 DRM drivers (nouveau, omapdrm
and gma500) used hardware acceleration in the past and thus code for checking
and using scrolling acceleration is obsolete.

This statement is NOT TRUE, because beside the DRM drivers there are around 35
other fbdev drivers which depend on fbdev/fbcon and still provide hardware
acceleration for fbdev/fbcon.

The original commit message also states that syzbot found lots of bugs in fbcon
and thus it's "often the solution to just delete code and remove features".
This is true, and the bugs - which actually affected all users of fbcon,
including DRM - were fixed, or code was dropped like e.g. the support for
software scrollback in vgacon (commit 973c096f6a85).

So to further analyze which bugs were found by syzbot, I've looked through all
patches in drivers/video which were tagged with syzbot or syzkaller back to
year 2005. The vast majority fixed the reported issues on a higher level, e.g.
when screen is to be resized, or when font size is to be changed. The few ones
which touched driver code fixed a real driver bug, e.g. by adding a check.

But NONE of those patches touched code of either the SCROLL_MOVE or the
SCROLL_REDRAW case.

That means, there was no real reason why SCROLL_MOVE had to be ripped-out and
just SCROLL_REDRAW had to be used instead. The only reason I can imagine so far
was that SCROLL_MOVE wasn't used by DRM and as such it was assumed that it
could go away. That argument completely missed the fact that SCROLL_MOVE is
still heavily used by fbdev (non-DRM) drivers.

Some people mention that using memcpy() instead of the hardware acceleration is
pretty much the same speed. But that's not true, at least not for older graphic
cards and machines where we see speed decreases by factor 10 and more and thus
this change leads to console responsiveness way worse than before.

That's why the original commit is to be reverted. By reverting we
reintroduce hardware-based scrolling acceleration and fix the
performance regression for fbdev drivers.

There isn't any impact on DRM when reverting those patches.

Signed-off-by: Helge Deller <deller@gmx.de>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Sven Schnelle <svens@stackframe.org>
Cc: stable@vger.kernel.org # v5.16+
Signed-off-by: Helge Deller <deller@gmx.de>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20220202135531.92183-2-deller@gmx.de
---
 include/linux/fb.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fb.h b/include/linux/fb.h
index 3da95842b207..02f362c661c8 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -262,7 +262,7 @@ struct fb_ops {
 
 	/* Draws a rectangle */
 	void (*fb_fillrect) (struct fb_info *info, const struct fb_fillrect *rect);
-	/* Copy data from area to another. Obsolete. */
+	/* Copy data from area to another */
 	void (*fb_copyarea) (struct fb_info *info, const struct fb_copyarea *region);
 	/* Draws a image to the display */
 	void (*fb_imageblit) (struct fb_info *info, const struct fb_image *image);
-- 
cgit v1.2.3


From e1d2699b96793d19388e302fa095e0da2c145701 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 18 Jan 2022 22:10:52 -0500
Subject: NFS: Avoid duplicate uncached readdir calls on eof

If we've reached the end of the directory, then cache that information
in the context so that we don't need to do an uncached readdir in order
to rediscover that fact.

Fixes: 794092c57f89 ("NFS: Do uncached readdir when we're seeking a cookie in an empty page cache")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/nfs_fs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 02aa49323d1d..68f81d8d36de 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -107,6 +107,7 @@ struct nfs_open_dir_context {
 	__u64 dup_cookie;
 	pgoff_t page_index;
 	signed char duped;
+	bool eof;
 };
 
 /*
-- 
cgit v1.2.3


From 2ea88716369ac9a7486a8cb309d6bf1239ea156c Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Sun, 23 Jan 2022 17:27:47 +0100
Subject: libceph: make recv path in secure mode work the same as send path

The recv path of secure mode is intertwined with that of crc mode.
While it's slightly more efficient that way (the ciphertext is read
into the destination buffer and decrypted in place, thus avoiding
two potentially heavy memory allocations for the bounce buffer and
the corresponding sg array), it isn't really amenable to changes.
Sacrifice that edge and align with the send path which always uses
a full-sized bounce buffer (currently there is no other way -- if
the kernel crypto API ever grows support for streaming (piecewise)
en/decryption for GCM [1], we would be able to easily take advantage
of that on both sides).

[1] https://lore.kernel.org/all/20141225202830.GA18794@gondor.apana.org.au/

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
---
 include/linux/ceph/messenger.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index ff99ce094cfa..6c6b6ea52bb8 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -383,6 +383,10 @@ struct ceph_connection_v2_info {
 	struct ceph_gcm_nonce in_gcm_nonce;
 	struct ceph_gcm_nonce out_gcm_nonce;
 
+	struct page **in_enc_pages;
+	int in_enc_page_cnt;
+	int in_enc_resid;
+	int in_enc_i;
 	struct page **out_enc_pages;
 	int out_enc_page_cnt;
 	int out_enc_resid;
-- 
cgit v1.2.3


From 038b8d1d1ab1cce11a158d30bf080ff41a2cfd15 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 30 Dec 2021 15:13:32 +0100
Subject: libceph: optionally use bounce buffer on recv path in crc mode

Both msgr1 and msgr2 in crc mode are zero copy in the sense that
message data is read from the socket directly into the destination
buffer.  We assume that the destination buffer is stable (i.e. remains
unchanged while it is being read to) though.  Otherwise, CRC errors
ensue:

  libceph: read_partial_message 0000000048edf8ad data crc 1063286393 != exp. 228122706
  libceph: osd1 (1)192.168.122.1:6843 bad crc/signature

  libceph: bad data crc, calculated 57958023, expected 1805382778
  libceph: osd2 (2)192.168.122.1:6876 integrity error, bad crc

Introduce rxbounce option to enable use of a bounce buffer when
receiving message data.  In particular this is needed if a mapped
image is a Windows VM disk, passed to QEMU.  Windows has a system-wide
"dummy" page that may be mapped into the destination buffer (potentially
more than once into the same buffer) by the Windows Memory Manager in
an effort to generate a single large I/O [1][2].  QEMU makes a point of
preserving overlap relationships when cloning I/O vectors, so krbd gets
exposed to this behaviour.

[1] "What Is Really in That MDL?"
    https://docs.microsoft.com/en-us/previous-versions/windows/hardware/design/dn614012(v=vs.85)
[2] https://blogs.msmvps.com/kernelmustard/2005/05/04/dummy-pages/

URL: https://bugzilla.redhat.com/show_bug.cgi?id=1973317
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
---
 include/linux/ceph/libceph.h   | 1 +
 include/linux/ceph/messenger.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 6a89ea410e43..edf62eaa6285 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -35,6 +35,7 @@
 #define CEPH_OPT_TCP_NODELAY      (1<<4) /* TCP_NODELAY on TCP sockets */
 #define CEPH_OPT_NOMSGSIGN        (1<<5) /* don't sign msgs (msgr1) */
 #define CEPH_OPT_ABORT_ON_FULL    (1<<6) /* abort w/ ENOSPC when full */
+#define CEPH_OPT_RXBOUNCE         (1<<7) /* double-buffer read data */
 
 #define CEPH_OPT_DEFAULT   (CEPH_OPT_TCP_NODELAY)
 
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 6c6b6ea52bb8..e7f2fb2fc207 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -461,6 +461,7 @@ struct ceph_connection {
 	struct ceph_msg *out_msg;        /* sending message (== tail of
 					    out_sent) */
 
+	struct page *bounce_page;
 	u32 in_front_crc, in_middle_crc, in_data_crc;  /* calculated crc */
 
 	struct timespec64 last_keepalive_ack; /* keepalive2 ack stamp */
-- 
cgit v1.2.3


From bfb1a7c91fb7758273b4a8d735313d9cc388b502 Mon Sep 17 00:00:00 2001
From: Nick Desaulniers <ndesaulniers@google.com>
Date: Wed, 2 Feb 2022 12:55:53 -0800
Subject: x86/bug: Merge annotate_reachable() into _BUG_FLAGS() asm

In __WARN_FLAGS(), we had two asm statements (abbreviated):

  asm volatile("ud2");
  asm volatile(".pushsection .discard.reachable");

These pair of statements are used to trigger an exception, but then help
objtool understand that for warnings, control flow will be restored
immediately afterwards.

The problem is that volatile is not a compiler barrier. GCC explicitly
documents this:

> Note that the compiler can move even volatile asm instructions
> relative to other code, including across jump instructions.

Also, no clobbers are specified to prevent instructions from subsequent
statements from being scheduled by compiler before the second asm
statement. This can lead to instructions from subsequent statements
being emitted by the compiler before the second asm statement.

Providing a scheduling model such as via -march= options enables the
compiler to better schedule instructions with known latencies to hide
latencies from data hazards compared to inline asm statements in which
latencies are not estimated.

If an instruction gets scheduled by the compiler between the two asm
statements, then objtool will think that it is not reachable, producing
a warning.

To prevent instructions from being scheduled in between the two asm
statements, merge them.

Also remove an unnecessary unreachable() asm annotation from BUG() in
favor of __builtin_unreachable(). objtool is able to track that the ud2
from BUG() terminates control flow within the function.

Link: https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Volatile
Link: https://github.com/ClangBuiltLinux/linux/issues/1483
Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Link: https://lore.kernel.org/r/20220202205557.2260694-1-ndesaulniers@google.com
---
 include/linux/compiler.h | 21 +++++----------------
 1 file changed, 5 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 429dcebe2b99..0f7fd205ab7e 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -117,14 +117,6 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
  */
 #define __stringify_label(n) #n
 
-#define __annotate_reachable(c) ({					\
-	asm volatile(__stringify_label(c) ":\n\t"			\
-		     ".pushsection .discard.reachable\n\t"		\
-		     ".long " __stringify_label(c) "b - .\n\t"		\
-		     ".popsection\n\t" : : "i" (c));			\
-})
-#define annotate_reachable() __annotate_reachable(__COUNTER__)
-
 #define __annotate_unreachable(c) ({					\
 	asm volatile(__stringify_label(c) ":\n\t"			\
 		     ".pushsection .discard.unreachable\n\t"		\
@@ -133,24 +125,21 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
 })
 #define annotate_unreachable() __annotate_unreachable(__COUNTER__)
 
-#define ASM_UNREACHABLE							\
-	"999:\n\t"							\
-	".pushsection .discard.unreachable\n\t"				\
-	".long 999b - .\n\t"						\
+#define ASM_REACHABLE							\
+	"998:\n\t"							\
+	".pushsection .discard.reachable\n\t"				\
+	".long 998b - .\n\t"						\
 	".popsection\n\t"
 
 /* Annotate a C jump table to allow objtool to follow the code flow */
 #define __annotate_jump_table __section(".rodata..c_jump_table")
 
 #else
-#define annotate_reachable()
 #define annotate_unreachable()
+# define ASM_REACHABLE
 #define __annotate_jump_table
 #endif
 
-#ifndef ASM_UNREACHABLE
-# define ASM_UNREACHABLE
-#endif
 #ifndef unreachable
 # define unreachable() do {		\
 	annotate_unreachable();		\
-- 
cgit v1.2.3


From e85c81ba8859a4c839bcd69c5d83b32954133a5b Mon Sep 17 00:00:00 2001
From: Xin Yin <yinxin.x@bytedance.com>
Date: Mon, 17 Jan 2022 17:36:54 +0800
Subject: ext4: fast commit may not fallback for ineligible commit

For the follow scenario:
1. jbd start commit transaction n
2. task A get new handle for transaction n+1
3. task A do some ineligible actions and mark FC_INELIGIBLE
4. jbd complete transaction n and clean FC_INELIGIBLE
5. task A call fsync

In this case fast commit will not fallback to full commit and
transaction n+1 also not handled by jbd.

Make ext4_fc_mark_ineligible() also record transaction tid for
latest ineligible case, when call ext4_fc_cleanup() check
current transaction tid, if small than latest ineligible tid
do not clear the EXT4_MF_FC_INELIGIBLE.

Reported-by: kernel test robot <lkp@intel.com>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Reported-by: Ritesh Harjani <riteshh@linux.ibm.com>
Suggested-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
Signed-off-by: Xin Yin <yinxin.x@bytedance.com>
Link: https://lore.kernel.org/r/20220117093655.35160-2-yinxin.x@bytedance.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
---
 include/linux/jbd2.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index fd933c45281a..d63b8106796e 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1295,7 +1295,7 @@ struct journal_s
 	 * Clean-up after fast commit or full commit. JBD2 calls this function
 	 * after every commit operation.
 	 */
-	void (*j_fc_cleanup_callback)(struct journal_s *journal, int);
+	void (*j_fc_cleanup_callback)(struct journal_s *journal, int full, tid_t tid);
 
 	/**
 	 * @j_fc_replay_callback:
-- 
cgit v1.2.3


From 3ca40c0d329113a9f76f6aa01abe73d9f16ace9d Mon Sep 17 00:00:00 2001
From: Ritesh Harjani <riteshh@linux.ibm.com>
Date: Mon, 17 Jan 2022 17:41:50 +0530
Subject: jbd2: cleanup unused functions declarations from jbd2.h

During code review found no references of few of these below function
declarations. This patch cleans those up from jbd2.h

Signed-off-by: Ritesh Harjani <riteshh@linux.ibm.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/30d1fc327becda197a4136cf9cdc73d9baa3b7b9.1642416995.git.riteshh@linux.ibm.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 include/linux/jbd2.h | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index d63b8106796e..afc5572e7b8a 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1419,9 +1419,7 @@ extern void jbd2_journal_unfile_buffer(journal_t *, struct journal_head *);
 extern bool __jbd2_journal_refile_buffer(struct journal_head *);
 extern void jbd2_journal_refile_buffer(journal_t *, struct journal_head *);
 extern void __jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int);
-extern void __journal_free_buffer(struct journal_head *bh);
 extern void jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int);
-extern void __journal_clean_data_list(transaction_t *transaction);
 static inline void jbd2_file_log_bh(struct list_head *head, struct buffer_head *bh)
 {
 	list_add_tail(&bh->b_assoc_buffers, head);
@@ -1486,9 +1484,6 @@ extern int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
 					      struct buffer_head **bh_out,
 					      sector_t blocknr);
 
-/* Transaction locking */
-extern void		__wait_on_journal (journal_t *);
-
 /* Transaction cache support */
 extern void jbd2_journal_destroy_transaction_cache(void);
 extern int __init jbd2_journal_init_transaction_cache(void);
@@ -1774,8 +1769,6 @@ static inline unsigned long jbd2_log_space_left(journal_t *journal)
 #define BJ_Reserved	4	/* Buffer is reserved for access by journal */
 #define BJ_Types	5
 
-extern int jbd_blocks_per_page(struct inode *inode);
-
 /* JBD uses a CRC32 checksum */
 #define JBD_MAX_CHECKSUM_SIZE 4
 
-- 
cgit v1.2.3


From 4f98186848707f530669238d90e0562d92a78aab Mon Sep 17 00:00:00 2001
From: Ritesh Harjani <riteshh@linux.ibm.com>
Date: Mon, 17 Jan 2022 17:41:51 +0530
Subject: jbd2: refactor wait logic for transaction updates into a common
 function

No functionality change as such in this patch. This only refactors the
common piece of code which waits for t_updates to finish into a common
function named as jbd2_journal_wait_updates(journal_t *)

Signed-off-by: Ritesh Harjani <riteshh@linux.ibm.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/8c564f70f4b2591171677a2a74fccb22a7b6c3a4.1642416995.git.riteshh@linux.ibm.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 include/linux/jbd2.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index afc5572e7b8a..9c3ada74ffb1 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -594,7 +594,7 @@ struct transaction_s
 	 */
 	unsigned long		t_log_start;
 
-	/* 
+	/*
 	 * Number of buffers on the t_buffers list [j_list_lock, no locks
 	 * needed for jbd2 thread]
 	 */
@@ -1538,6 +1538,8 @@ extern int	 jbd2_journal_flush(journal_t *journal, unsigned int flags);
 extern void	 jbd2_journal_lock_updates (journal_t *);
 extern void	 jbd2_journal_unlock_updates (journal_t *);
 
+void jbd2_journal_wait_updates(journal_t *);
+
 extern journal_t * jbd2_journal_init_dev(struct block_device *bdev,
 				struct block_device *fs_dev,
 				unsigned long long start, int len, int bsize);
-- 
cgit v1.2.3


From 67d6212afda218d564890d1674bab28e8612170f Mon Sep 17 00:00:00 2001
From: Igor Pylypiv <ipylypiv@google.com>
Date: Thu, 27 Jan 2022 15:39:53 -0800
Subject: Revert "module, async: async_synchronize_full() on module init iff
 async is used"

This reverts commit 774a1221e862b343388347bac9b318767336b20b.

We need to finish all async code before the module init sequence is
done.  In the reverted commit the PF_USED_ASYNC flag was added to mark a
thread that called async_schedule().  Then the PF_USED_ASYNC flag was
used to determine whether or not async_synchronize_full() needs to be
invoked.  This works when modprobe thread is calling async_schedule(),
but it does not work if module dispatches init code to a worker thread
which then calls async_schedule().

For example, PCI driver probing is invoked from a worker thread based on
a node where device is attached:

	if (cpu < nr_cpu_ids)
		error = work_on_cpu(cpu, local_pci_probe, &ddi);
	else
		error = local_pci_probe(&ddi);

We end up in a situation where a worker thread gets the PF_USED_ASYNC
flag set instead of the modprobe thread.  As a result,
async_synchronize_full() is not invoked and modprobe completes without
waiting for the async code to finish.

The issue was discovered while loading the pm80xx driver:
(scsi_mod.scan=async)

modprobe pm80xx                      worker
...
  do_init_module()
  ...
    pci_call_probe()
      work_on_cpu(local_pci_probe)
                                     local_pci_probe()
                                       pm8001_pci_probe()
                                         scsi_scan_host()
                                           async_schedule()
                                           worker->flags |= PF_USED_ASYNC;
                                     ...
      < return from worker >
  ...
  if (current->flags & PF_USED_ASYNC) <--- false
  	async_synchronize_full();

Commit 21c3c5d28007 ("block: don't request module during elevator init")
fixed the deadlock issue which the reverted commit 774a1221e862
("module, async: async_synchronize_full() on module init iff async is
used") tried to fix.

Since commit 0fdff3ec6d87 ("async, kmod: warn on synchronous
request_module() from async workers") synchronous module loading from
async is not allowed.

Given that the original deadlock issue is fixed and it is no longer
allowed to call synchronous request_module() from async we can remove
PF_USED_ASYNC flag to make module init consistently invoke
async_synchronize_full() unless async module probe is requested.

Signed-off-by: Igor Pylypiv <ipylypiv@google.com>
Reviewed-by: Changyuan Lyu <changyuanl@google.com>
Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index f5b2be39a78c..75ba8aa60248 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1680,7 +1680,6 @@ extern struct pid *cad_pid;
 #define PF_MEMALLOC		0x00000800	/* Allocating memory */
 #define PF_NPROC_EXCEEDED	0x00001000	/* set_user() noticed that RLIMIT_NPROC was exceeded */
 #define PF_USED_MATH		0x00002000	/* If unset the fpu must be initialized before use */
-#define PF_USED_ASYNC		0x00004000	/* Used async_schedule*(), used by module init */
 #define PF_NOFREEZE		0x00008000	/* This thread should not be frozen */
 #define PF_FROZEN		0x00010000	/* Frozen for system suspend */
 #define PF_KSWAPD		0x00020000	/* I am kswapd */
-- 
cgit v1.2.3


From ac9f0c810684a1b161c18eb4b91ce84cbc13c91d Mon Sep 17 00:00:00 2001
From: Anton Lundin <glance@acc.umu.se>
Date: Thu, 3 Feb 2022 10:41:35 +0100
Subject: ata: libata-core: Introduce ATA_HORKAGE_NO_LOG_DIR horkage

06f6c4c6c3e8 ("ata: libata: add missing ata_identify_page_supported() calls")
introduced additional calls to ata_identify_page_supported(), thus also
adding indirectly accesses to the device log directory log page through
ata_log_supported(). Reading this log page causes SATADOM-ML 3ME devices
to lock up.

Introduce the horkage flag ATA_HORKAGE_NO_LOG_DIR to prevent accesses to
the log directory in ata_log_supported() and add a blacklist entry
with this flag for "SATADOM-ML 3ME" devices.

Fixes: 636f6e2af4fb ("libata: add horkage for missing Identify Device log")
Cc: stable@vger.kernel.org # v5.10+
Signed-off-by: Anton Lundin <glance@acc.umu.se>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
---
 include/linux/libata.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 605756f645be..7f99b4d78822 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -380,6 +380,7 @@ enum {
 	ATA_HORKAGE_MAX_TRIM_128M = (1 << 26),	/* Limit max trim size to 128M */
 	ATA_HORKAGE_NO_NCQ_ON_ATI = (1 << 27),	/* Disable NCQ on ATI chipset */
 	ATA_HORKAGE_NO_ID_DEV_LOG = (1 << 28),	/* Identify device log missing */
+	ATA_HORKAGE_NO_LOG_DIR	= (1 << 29),	/* Do not read log directory */
 
 	 /* DMA mask for user DMA control: User visible values; DO NOT
 	    renumber */
-- 
cgit v1.2.3


From 80110bbfbba6f0078d5a1cbc8df004506db8ffe5 Mon Sep 17 00:00:00 2001
From: Pasha Tatashin <pasha.tatashin@soleen.com>
Date: Thu, 3 Feb 2022 20:49:24 -0800
Subject: mm/page_table_check: check entries at pmd levels

syzbot detected a case where the page table counters were not properly
updated.

  syzkaller login:  ------------[ cut here ]------------
  kernel BUG at mm/page_table_check.c:162!
  invalid opcode: 0000 [#1] PREEMPT SMP KASAN
  CPU: 0 PID: 3099 Comm: pasha Not tainted 5.16.0+ #48
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIO4
  RIP: 0010:__page_table_check_zero+0x159/0x1a0
  Call Trace:
   free_pcp_prepare+0x3be/0xaa0
   free_unref_page+0x1c/0x650
   free_compound_page+0xec/0x130
   free_transhuge_page+0x1be/0x260
   __put_compound_page+0x90/0xd0
   release_pages+0x54c/0x1060
   __pagevec_release+0x7c/0x110
   shmem_undo_range+0x85e/0x1250
  ...

The repro involved having a huge page that is split due to uprobe event
temporarily replacing one of the pages in the huge page.  Later the huge
page was combined again, but the counters were off, as the PTE level was
not properly updated.

Make sure that when PMD is cleared and prior to freeing the level the
PTEs are updated.

Link: https://lkml.kernel.org/r/20220131203249.2832273-5-pasha.tatashin@soleen.com
Fixes: df4e817b7108 ("mm: page table check")
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Slaby <jirislaby@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Paul Turner <pjt@google.com>
Cc: Wei Xu <weixugc@google.com>
Cc: Will Deacon <will@kernel.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page_table_check.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h
index 38cace1da7b6..01e16c7696ec 100644
--- a/include/linux/page_table_check.h
+++ b/include/linux/page_table_check.h
@@ -26,6 +26,9 @@ void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr,
 				pmd_t *pmdp, pmd_t pmd);
 void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr,
 				pud_t *pudp, pud_t pud);
+void __page_table_check_pte_clear_range(struct mm_struct *mm,
+					unsigned long addr,
+					pmd_t pmd);
 
 static inline void page_table_check_alloc(struct page *page, unsigned int order)
 {
@@ -100,6 +103,16 @@ static inline void page_table_check_pud_set(struct mm_struct *mm,
 	__page_table_check_pud_set(mm, addr, pudp, pud);
 }
 
+static inline void page_table_check_pte_clear_range(struct mm_struct *mm,
+						    unsigned long addr,
+						    pmd_t pmd)
+{
+	if (static_branch_likely(&page_table_check_disabled))
+		return;
+
+	__page_table_check_pte_clear_range(mm, addr, pmd);
+}
+
 #else
 
 static inline void page_table_check_alloc(struct page *page, unsigned int order)
@@ -143,5 +156,11 @@ static inline void page_table_check_pud_set(struct mm_struct *mm,
 {
 }
 
+static inline void page_table_check_pte_clear_range(struct mm_struct *mm,
+						    unsigned long addr,
+						    pmd_t pmd)
+{
+}
+
 #endif /* CONFIG_PAGE_TABLE_CHECK */
 #endif /* __LINUX_PAGE_TABLE_CHECK_H */
-- 
cgit v1.2.3


From 314c459a6fe0957b5885fbc65c53d51444092880 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Thu, 3 Feb 2022 20:49:29 -0800
Subject: mm/pgtable: define pte_index so that preprocessor could recognize it

Since commit 974b9b2c68f3 ("mm: consolidate pte_index() and
pte_offset_*() definitions") pte_index is a static inline and there is
no define for it that can be recognized by the preprocessor.  As a
result, vm_insert_pages() uses slower loop over vm_insert_page() instead
of insert_pages() that amortizes the cost of spinlock operations when
inserting multiple pages.

Link: https://lkml.kernel.org/r/20220111145457.20748-1-rppt@kernel.org
Fixes: 974b9b2c68f3 ("mm: consolidate pte_index() and pte_offset_*() definitions")
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Reported-by: Christian Dietrich <stettberger@dokucode.de>
Reviewed-by: Khalid Aziz <khalid.aziz@oracle.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pgtable.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index bc8713a76e03..f4f4077b97aa 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -62,6 +62,7 @@ static inline unsigned long pte_index(unsigned long address)
 {
 	return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
 }
+#define pte_index pte_index
 
 #ifndef pmd_index
 static inline unsigned long pmd_index(unsigned long address)
-- 
cgit v1.2.3


From fda17afc6166e975bec1197bd94cd2a3317bce3f Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Date: Mon, 7 Feb 2022 11:27:53 +0900
Subject: ata: libata-core: Fix ata_dev_config_cpr()

The concurrent positioning ranges log page 47h is a general purpose log
page and not a subpage of the indentify device log. Using
ata_identify_page_supported() to test for concurrent positioning ranges
support is thus wrong. ata_log_supported() must be used.

Furthermore, unlike other advanced ATA features (e.g. NCQ priority),
accesses to the concurrent positioning ranges log page are not gated by
a feature bit from the device IDENTIFY data. Since many older drives
react badly to the READ LOG EXT and/or READ LOG DMA EXT commands isued
to read device log pages, avoid problems with older drives by limiting
the concurrent positioning ranges support detection to drives
implementing at least the ACS-4 ATA standard (major version 11). This
additional condition effectively turns ata_dev_config_cpr() into a nop
for older drives, avoiding problems in the field.

Fixes: fe22e1c2f705 ("libata: support concurrent positioning ranges log")
BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=215519
Cc: stable@vger.kernel.org
Reviewed-by: Hannes Reinecke <hare@suse.de>
Tested-by: Abderraouf Adjal <adjal.arf@gmail.com>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
---
 include/linux/ata.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ata.h b/include/linux/ata.h
index 199e47e97d64..21292b5bbb55 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -324,12 +324,12 @@ enum {
 	ATA_LOG_NCQ_NON_DATA	= 0x12,
 	ATA_LOG_NCQ_SEND_RECV	= 0x13,
 	ATA_LOG_IDENTIFY_DEVICE	= 0x30,
+	ATA_LOG_CONCURRENT_POSITIONING_RANGES = 0x47,
 
 	/* Identify device log pages: */
 	ATA_LOG_SECURITY	  = 0x06,
 	ATA_LOG_SATA_SETTINGS	  = 0x08,
 	ATA_LOG_ZONED_INFORMATION = 0x09,
-	ATA_LOG_CONCURRENT_POSITIONING_RANGES = 0x47,
 
 	/* Identify device SATA settings log:*/
 	ATA_LOG_DEVSLP_OFFSET	  = 0x30,
-- 
cgit v1.2.3


From cb1f65c1e1424a4b5e4a86da8aa3b8fd8459c8ec Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Fri, 4 Feb 2022 18:35:22 +0100
Subject: PM: s2idle: ACPI: Fix wakeup interrupts handling

After commit e3728b50cd9b ("ACPI: PM: s2idle: Avoid possible race
related to the EC GPE") wakeup interrupts occurring immediately after
the one discarded by acpi_s2idle_wake() may be missed.  Moreover, if
the SCI triggers again immediately after the rearming in
acpi_s2idle_wake(), that wakeup may be missed too.

The problem is that pm_system_irq_wakeup() only calls pm_system_wakeup()
when pm_wakeup_irq is 0, but that's not the case any more after the
interrupt causing acpi_s2idle_wake() to run until pm_wakeup_irq is
cleared by the pm_wakeup_clear() call in s2idle_loop().  However,
there may be wakeup interrupts occurring in that time frame and if
that happens, they will be missed.

To address that issue first move the clearing of pm_wakeup_irq to
the point at which it is known that the interrupt causing
acpi_s2idle_wake() to tun will be discarded, before rearming the SCI
for wakeup.  Moreover, because that only reduces the size of the
time window in which the issue may manifest itself, allow
pm_system_irq_wakeup() to register two second wakeup interrupts in
a row and, when discarding the first one, replace it with the second
one.  [Of course, this assumes that only one wakeup interrupt can be
discarded in one go, but currently that is the case and I am not
aware of any plans to change that.]

Fixes: e3728b50cd9b ("ACPI: PM: s2idle: Avoid possible race related to the EC GPE")
Cc: 5.4+ <stable@vger.kernel.org> # 5.4+
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/suspend.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 3e8ecdebe601..300273ff40cc 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -497,14 +497,14 @@ extern void ksys_sync_helper(void);
 
 /* drivers/base/power/wakeup.c */
 extern bool events_check_enabled;
-extern unsigned int pm_wakeup_irq;
 extern suspend_state_t pm_suspend_target_state;
 
 extern bool pm_wakeup_pending(void);
 extern void pm_system_wakeup(void);
 extern void pm_system_cancel_wakeup(void);
-extern void pm_wakeup_clear(bool reset);
+extern void pm_wakeup_clear(unsigned int irq_number);
 extern void pm_system_irq_wakeup(unsigned int irq_number);
+extern unsigned int pm_wakeup_irq(void);
 extern bool pm_get_wakeup_count(unsigned int *count, bool block);
 extern bool pm_save_wakeup_count(unsigned int count);
 extern void pm_wakep_autosleep_enabled(bool set);
-- 
cgit v1.2.3


From c306d737691ef84305d4ed0d302c63db2932f0bb Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 25 Jan 2022 15:57:45 -0500
Subject: NFSD: Deprecate NFS_OFFSET_MAX

NFS_OFFSET_MAX was introduced way back in Linux v2.3.y before there
was a kernel-wide OFFSET_MAX value. As a clean up, replace the last
few uses of it with its generic equivalent, and get rid of it.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/nfs.h | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs.h b/include/linux/nfs.h
index 0dc7ad38a0da..b06375e88e58 100644
--- a/include/linux/nfs.h
+++ b/include/linux/nfs.h
@@ -36,14 +36,6 @@ static inline void nfs_copy_fh(struct nfs_fh *target, const struct nfs_fh *sourc
 	memcpy(target->data, source->data, source->size);
 }
 
-
-/*
- * This is really a general kernel constant, but since nothing like
- * this is defined in the kernel headers, I have to do it here.
- */
-#define NFS_OFFSET_MAX		((__s64)((~(__u64)0) >> 1))
-
-
 enum nfs3_stable_how {
 	NFS_UNSTABLE = 0,
 	NFS_DATA_SYNC = 1,
-- 
cgit v1.2.3


From 0764db9b49c932b89ee4d9e3236dff4bb07b4a66 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Fri, 11 Feb 2022 16:32:32 -0800
Subject: mm: memcg: synchronize objcg lists with a dedicated spinlock

Alexander reported a circular lock dependency revealed by the mmap1 ltp
test:

  LOCKDEP_CIRCULAR (suite: ltp, case: mtest06 (mmap1))
          WARNING: possible circular locking dependency detected
          5.17.0-20220113.rc0.git0.f2211f194038.300.fc35.s390x+debug #1 Not tainted
          ------------------------------------------------------
          mmap1/202299 is trying to acquire lock:
          00000001892c0188 (css_set_lock){..-.}-{2:2}, at: obj_cgroup_release+0x4a/0xe0
          but task is already holding lock:
          00000000ca3b3818 (&sighand->siglock){-.-.}-{2:2}, at: force_sig_info_to_task+0x38/0x180
          which lock already depends on the new lock.
          the existing dependency chain (in reverse order) is:
          -> #1 (&sighand->siglock){-.-.}-{2:2}:
                 __lock_acquire+0x604/0xbd8
                 lock_acquire.part.0+0xe2/0x238
                 lock_acquire+0xb0/0x200
                 _raw_spin_lock_irqsave+0x6a/0xd8
                 __lock_task_sighand+0x90/0x190
                 cgroup_freeze_task+0x2e/0x90
                 cgroup_migrate_execute+0x11c/0x608
                 cgroup_update_dfl_csses+0x246/0x270
                 cgroup_subtree_control_write+0x238/0x518
                 kernfs_fop_write_iter+0x13e/0x1e0
                 new_sync_write+0x100/0x190
                 vfs_write+0x22c/0x2d8
                 ksys_write+0x6c/0xf8
                 __do_syscall+0x1da/0x208
                 system_call+0x82/0xb0
          -> #0 (css_set_lock){..-.}-{2:2}:
                 check_prev_add+0xe0/0xed8
                 validate_chain+0x736/0xb20
                 __lock_acquire+0x604/0xbd8
                 lock_acquire.part.0+0xe2/0x238
                 lock_acquire+0xb0/0x200
                 _raw_spin_lock_irqsave+0x6a/0xd8
                 obj_cgroup_release+0x4a/0xe0
                 percpu_ref_put_many.constprop.0+0x150/0x168
                 drain_obj_stock+0x94/0xe8
                 refill_obj_stock+0x94/0x278
                 obj_cgroup_charge+0x164/0x1d8
                 kmem_cache_alloc+0xac/0x528
                 __sigqueue_alloc+0x150/0x308
                 __send_signal+0x260/0x550
                 send_signal+0x7e/0x348
                 force_sig_info_to_task+0x104/0x180
                 force_sig_fault+0x48/0x58
                 __do_pgm_check+0x120/0x1f0
                 pgm_check_handler+0x11e/0x180
          other info that might help us debug this:
           Possible unsafe locking scenario:
                 CPU0                    CPU1
                 ----                    ----
            lock(&sighand->siglock);
                                         lock(css_set_lock);
                                         lock(&sighand->siglock);
            lock(css_set_lock);
           *** DEADLOCK ***
          2 locks held by mmap1/202299:
           #0: 00000000ca3b3818 (&sighand->siglock){-.-.}-{2:2}, at: force_sig_info_to_task+0x38/0x180
           #1: 00000001892ad560 (rcu_read_lock){....}-{1:2}, at: percpu_ref_put_many.constprop.0+0x0/0x168
          stack backtrace:
          CPU: 15 PID: 202299 Comm: mmap1 Not tainted 5.17.0-20220113.rc0.git0.f2211f194038.300.fc35.s390x+debug #1
          Hardware name: IBM 3906 M04 704 (LPAR)
          Call Trace:
            dump_stack_lvl+0x76/0x98
            check_noncircular+0x136/0x158
            check_prev_add+0xe0/0xed8
            validate_chain+0x736/0xb20
            __lock_acquire+0x604/0xbd8
            lock_acquire.part.0+0xe2/0x238
            lock_acquire+0xb0/0x200
            _raw_spin_lock_irqsave+0x6a/0xd8
            obj_cgroup_release+0x4a/0xe0
            percpu_ref_put_many.constprop.0+0x150/0x168
            drain_obj_stock+0x94/0xe8
            refill_obj_stock+0x94/0x278
            obj_cgroup_charge+0x164/0x1d8
            kmem_cache_alloc+0xac/0x528
            __sigqueue_alloc+0x150/0x308
            __send_signal+0x260/0x550
            send_signal+0x7e/0x348
            force_sig_info_to_task+0x104/0x180
            force_sig_fault+0x48/0x58
            __do_pgm_check+0x120/0x1f0
            pgm_check_handler+0x11e/0x180
          INFO: lockdep is turned off.

In this example a slab allocation from __send_signal() caused a
refilling and draining of a percpu objcg stock, resulted in a releasing
of another non-related objcg.  Objcg release path requires taking the
css_set_lock, which is used to synchronize objcg lists.

This can create a circular dependency with the sighandler lock, which is
taken with the locked css_set_lock by the freezer code (to freeze a
task).

In general it seems that using css_set_lock to synchronize objcg lists
makes any slab allocations and deallocation with the locked css_set_lock
and any intervened locks risky.

To fix the problem and make the code more robust let's stop using
css_set_lock to synchronize objcg lists and use a new dedicated spinlock
instead.

Link: https://lkml.kernel.org/r/Yfm1IHmoGdyUR81T@carbon.dhcp.thefacebook.com
Fixes: bf4f059954dc ("mm: memcg/slab: obj_cgroup API")
Signed-off-by: Roman Gushchin <guro@fb.com>
Reported-by: Alexander Egorenkov <egorenar@linux.ibm.com>
Tested-by: Alexander Egorenkov <egorenar@linux.ibm.com>
Reviewed-by: Waiman Long <longman@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Jeremy Linton <jeremy.linton@arm.com>
Tested-by: Jeremy Linton <jeremy.linton@arm.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b72d75141e12..0abbd685703b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -219,7 +219,7 @@ struct obj_cgroup {
 	struct mem_cgroup *memcg;
 	atomic_t nr_charged_bytes;
 	union {
-		struct list_head list;
+		struct list_head list; /* protected by objcg_lock */
 		struct rcu_head rcu;
 	};
 };
@@ -315,7 +315,8 @@ struct mem_cgroup {
 #ifdef CONFIG_MEMCG_KMEM
 	int kmemcg_id;
 	struct obj_cgroup __rcu *objcg;
-	struct list_head objcg_list; /* list of inherited objcgs */
+	/* list of inherited objcgs, protected by objcg_lock */
+	struct list_head objcg_list;
 #endif
 
 	MEMCG_PADDING(_pad2_);
-- 
cgit v1.2.3


From 8913c61001482378d4ed8cc577b17c1ba3e847e4 Mon Sep 17 00:00:00 2001
From: Peng Liu <liupeng256@huawei.com>
Date: Fri, 11 Feb 2022 16:32:35 -0800
Subject: kfence: make test case compatible with run time set sample interval

The parameter kfence_sample_interval can be set via boot parameter and
late shell command, which is convenient for automated tests and KFENCE
parameter optimization.  However, KFENCE test case just uses
compile-time CONFIG_KFENCE_SAMPLE_INTERVAL, which will make KFENCE test
case not run as users desired.  Export kfence_sample_interval, so that
KFENCE test case can use run-time-set sample interval.

Link: https://lkml.kernel.org/r/20220207034432.185532-1-liupeng256@huawei.com
Signed-off-by: Peng Liu <liupeng256@huawei.com>
Reviewed-by: Marco Elver <elver@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Sumit Semwal <sumit.semwal@linaro.org>
Cc: Christian Knig <christian.koenig@amd.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kfence.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kfence.h b/include/linux/kfence.h
index 4b5e3679a72c..f49e64222628 100644
--- a/include/linux/kfence.h
+++ b/include/linux/kfence.h
@@ -17,6 +17,8 @@
 #include <linux/atomic.h>
 #include <linux/static_key.h>
 
+extern unsigned long kfence_sample_interval;
+
 /*
  * We allocate an even number of pages, as it simplifies calculations to map
  * address to metadata indices; effectively, the very first page serves as an
-- 
cgit v1.2.3